Saturday, January 1, 2011

CSV in Erlang

Look Ma, no code (well, almost; in fact, pattern matching does count as code). Implementing a state machine in Erlang is an amazingly easy task, and the same pattern can be used in other languages that support tail recursion (Ruby and JavaScript are not on that list, though).
-module(csv).

-export([parse/1]).

%% @doc Parse CSV text given as a string (a list of characters).
%% Returns a list of lines in input order, each line being a list of field
%% strings. Starts the state machine with empty accumulators.
parse(Data) ->
  NoField = [],
  NoFields = [],
  NoLines = [],
  parse(Data, NoField, NoFields, NoLines).

%% Main state of the CSV state machine; dispatches on the next character.
%%   Rest - remaining input characters
%%   Acc  - characters of the field being read, newest first
%%   Row  - fields of the line being read, newest first
%%   Rows - completed lines, newest first
%% At end of input every accumulator is reversed, so the result is a list of
%% lines, each a list of field strings, in input order.
parse([$\r|Rest], Acc, Row, Rows) ->
  %% parse_r/4 accepts only a following "\n".
  parse_r(Rest, Acc, Row, Rows);
parse([$\n|Rest], Acc, Row, Rows) ->
  %% End of line: store the current field and line, start afresh.
  parse(Rest, [], [], [[Acc|Row]|Rows]);
parse([$,|Rest], Acc, Row, Rows) ->
  %% End of field: store it and start a new one on the same line.
  parse(Rest, [], [Acc|Row], Rows);
parse([$"|Rest], [], Row, Rows) ->
  %% A quote opens a quoted field only while the current field is still empty;
  %% otherwise it falls through to the next clause and is kept literally.
  parse_q(Rest, [], Row, Rows);
parse([Char|Rest], Acc, Row, Rows) ->
  parse(Rest, [Char|Acc], Row, Rows);
parse([], [], [], Rows) ->
  %% Nothing pending: undo the newest-first ordering at all three levels.
  %% Folding with a prepend reverses the list while mapping over it.
  FixLine = fun(RevLine, Done) ->
    Fields = lists:foldl(fun(RevField, Out) -> [lists:reverse(RevField)|Out] end,
                         [], RevLine),
    [Fields|Done]
  end,
  lists:foldl(FixLine, [], Rows);
parse([], Acc, Row, Rows) ->
  %% Input ended in the middle of a line: flush it, then finish as above.
  parse([], [], [], [[Acc|Row]|Rows]).

%% State entered right after a "\r". Only "\n" is accepted as the next
%% character; it is left in the input so that parse/4 handles the line break.
%% Anything else (including end of input) fails with a function_clause error.
parse_r([$\n|Tail], Acc, Row, Rows) ->
  parse([$\n|Tail], Acc, Row, Rows).

%% State inside a quoted field. A double quote moves to parse_qq/4, which
%% decides whether it closes the field or is an escaped quote; every other
%% character (commas and line breaks included) is added to the field.
%% End of input inside a quoted field fails with a function_clause error.
parse_q([Char|Rest], Acc, Row, Rows) ->
  case Char of
    $" -> parse_qq(Rest, Acc, Row, Rows);
    _  -> parse_q(Rest, [Char|Acc], Row, Rows)
  end.

%% State entered after a double quote seen inside a quoted field.
%%   - another quote: an escaped quote; add one quote character to the field
%%     and go back to the quoted state;
%%   - ",", "\r" or "\n": the quote closed the field; the delimiter is left in
%%     the input for parse/4 to handle;
%%   - end of input: the quote closed the last field of the last line.
%% Any other character fails with a function_clause error.
parse_qq([$"|Data], Field, Fields, Lines) ->
  parse_q(Data, [$"|Field], Fields, Lines);
parse_qq([C|_] = Data, Field, Fields, Lines)
  when C =:= $,; C =:= $\r; C =:= $\n ->
  parse(Data, Field, Fields, Lines);
parse_qq([], Field, Fields, Lines) ->
  %% Flush the field here instead of handing it back to parse/4: there an
  %% empty quoted field is indistinguishable from "nothing pending", so a
  %% final line consisting only of an empty quoted field would be dropped.
  parse([], [], [], [[Field|Fields]|Lines]).
What is more interesting is that we can use lazy evaluation to save memory when reading from a file. The only problem is that [C|fun...] here is not a proper list (it is a kind of pseudo-list: an improper list whose tail is a fun that produces the rest), so this syntax is somewhat misleading.

-module(csv).

-export([parse/1, lazy/1]).

-define(BUFFER_SIZE, 1024).

%% @doc Wrap an opened file in a lazy character stream.
%% The result is either [] or [Char|Next], where Next is a zero-arity fun that
%% returns the following cell. Starts with an empty read buffer.
lazy(IO) ->
  EmptyBuffer = [],
  lazy(IO, EmptyBuffer).

%% Produce the next cell of the lazy stream.
%%   IO     - file handle passed to file:read/2
%%   Buffer - characters already read but not yet handed out
%% With an empty buffer, up to ?BUFFER_SIZE more characters are read; eof ends
%% the stream with []. Any other result of file:read/2 (an error tuple, or
%% data that is not a non-empty list) fails with a case_clause error.
lazy(IO, []) ->
  case file:read(IO, ?BUFFER_SIZE) of
    eof ->
      [];
    {ok, [_|_] = Chunk} ->
      %% Serve the fresh chunk through the buffered clause below.
      lazy(IO, Chunk)
  end;
lazy(IO, [Char|Buffered]) ->
  %% The tail is a thunk, so the rest of the file is read only on demand.
  [Char|fun() -> lazy(IO, Buffered) end].

%% @doc Parse CSV from a lazy character stream as produced by lazy/1:
%% either [] or [Char|Next] with Next a zero-arity fun returning the next cell.
%% Returns a list of lines in input order, each line being a list of field
%% strings. Starts the state machine with empty accumulators.
parse(Data) ->
  NoField = [],
  NoFields = [],
  NoLines = [],
  parse(Data, NoField, NoFields, NoLines).

%% Main state of the CSV state machine over a lazy stream; dispatches on the
%% next character and forces the tail thunk to obtain the following cell.
%%   [Char|Next] - current cell; Next() yields the next cell or []
%%   Acc  - characters of the field being read, newest first
%%   Row  - fields of the line being read, newest first
%%   Rows - completed lines, newest first
%% At end of input every accumulator is reversed, so the result is a list of
%% lines, each a list of field strings, in input order.
parse([$\r|Next], Acc, Row, Rows) ->
  %% parse_r/4 accepts only a following "\n".
  parse_r(Next(), Acc, Row, Rows);
parse([$\n|Next], Acc, Row, Rows) ->
  %% End of line: store the current field and line, start afresh.
  parse(Next(), [], [], [[Acc|Row]|Rows]);
parse([$,|Next], Acc, Row, Rows) ->
  %% End of field: store it and start a new one on the same line.
  parse(Next(), [], [Acc|Row], Rows);
parse([$"|Next], [], Row, Rows) ->
  %% A quote opens a quoted field only while the current field is still empty;
  %% otherwise it falls through to the next clause and is kept literally.
  parse_q(Next(), [], Row, Rows);
parse([Char|Next], Acc, Row, Rows) ->
  parse(Next(), [Char|Acc], Row, Rows);
parse([], [], [], Rows) ->
  %% Nothing pending: undo the newest-first ordering at all three levels.
  %% Folding with a prepend reverses the list while mapping over it.
  FixLine = fun(RevLine, Done) ->
    Fields = lists:foldl(fun(RevField, Out) -> [lists:reverse(RevField)|Out] end,
                         [], RevLine),
    [Fields|Done]
  end,
  lists:foldl(FixLine, [], Rows);
parse([], Acc, Row, Rows) ->
  %% Input ended in the middle of a line: flush it, then finish as above.
  parse([], [], [], [[Acc|Row]|Rows]).

%% State entered right after a "\r". Only a cell holding "\n" is accepted; it
%% is passed on unconsumed (its tail thunk is not forced here) so that parse/4
%% handles the line break. Anything else (including end of input) fails with
%% a function_clause error.
parse_r([$\n|Next], Acc, Row, Rows) ->
  parse([$\n|Next], Acc, Row, Rows).

%% State inside a quoted field, reading from the lazy stream. A double quote
%% moves to parse_qq/4, which decides whether it closes the field or is an
%% escaped quote; every other character (commas and line breaks included) is
%% added to the field. End of input inside a quoted field fails with a
%% function_clause error.
parse_q([Char|Next], Acc, Row, Rows) ->
  %% Both branches consume the current cell, so force the thunk once up front.
  Following = Next(),
  case Char of
    $" -> parse_qq(Following, Acc, Row, Rows);
    _  -> parse_q(Following, [Char|Acc], Row, Rows)
  end.

%% State entered after a double quote seen inside a quoted field (lazy stream).
%%   - another quote: an escaped quote; add one quote character to the field,
%%     force the tail thunk and go back to the quoted state;
%%   - ",", "\r" or "\n": the quote closed the field; the delimiter cell is
%%     passed on unconsumed for parse/4 to handle;
%%   - end of input: the quote closed the last field of the last line.
%% Any other character fails with a function_clause error.
parse_qq([$"|Data], Field, Fields, Lines) ->
  parse_q(Data(), [$"|Field], Fields, Lines);
parse_qq([C|_] = Data, Field, Fields, Lines)
  when C =:= $,; C =:= $\r; C =:= $\n ->
  parse(Data, Field, Fields, Lines);
parse_qq([], Field, Fields, Lines) ->
  %% Flush the field here instead of handing it back to parse/4: there an
  %% empty quoted field is indistinguishable from "nothing pending", so a
  %% final line consisting only of an empty quoted field would be dropped.
  parse([], [], [], [[Field|Fields]|Lines]).
So now we can call it like this:

%% Open the file without the binary option, since lazy/2 matches the data
%% returned by file:read/2 as a list of characters. The try ... after closes
%% the file once parsing is done, even if parsing crashes; the value of the
%% whole expression is still the parsed result.
{ok, IO} = file:open("test.csv", [raw, read]),
try csv:parse(csv:lazy(IO)) after file:close(IO) end.

Isn't it beautiful?

No comments:

Post a Comment