View source with raw comments or as raw

    1/*  Part of SWI-Prolog
    2
    3    Author:        Jan Wielemaker
    4    E-mail:        J.Wielemaker@vu.nl
    5    WWW:           http://www.swi-prolog.org
    6    Copyright (c)  2009-2017, VU University Amsterdam
    7    All rights reserved.
    8
    9    Redistribution and use in source and binary forms, with or without
   10    modification, are permitted provided that the following conditions
   11    are met:
   12
   13    1. Redistributions of source code must retain the above copyright
   14       notice, this list of conditions and the following disclaimer.
   15
   16    2. Redistributions in binary form must reproduce the above copyright
   17       notice, this list of conditions and the following disclaimer in
   18       the documentation and/or other materials provided with the
   19       distribution.
   20
   21    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   22    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   23    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
   24    FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
   25    COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
   26    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
   27    BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
   28    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
   29    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   30    LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
   31    ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   32    POSSIBILITY OF SUCH DAMAGE.
   33*/
   34
   35:- module(csv,
   36          [ csv//1,                     % +Rows
   37            csv//2,                     % +Rows, +Options
   38
   39            csv_read_file/2,            % +File, -Data
   40            csv_read_file/3,            % +File, -Data, +Options
   41            csv_read_file_row/3,        % +File, -Row, +Options
   42            csv_read_row/3,		% +Stream, -Row, +CompiledOptions
   43            csv_options/2,		% -Compiled, +Options
   44
   45            csv_write_file/2,           % +File, +Data
   46            csv_write_file/3,           % +File, +Data, +Options
   47            csv_write_stream/3          % +Stream, +Data, +Options
   48          ]).   49:- use_module(library(record)).   50:- use_module(library(error)).   51:- use_module(library(pure_input)).   52:- use_module(library(debug)).   53:- use_module(library(option)).

Process CSV (Comma-Separated Values) data

This library parses and generates CSV data. CSV data is represented in Prolog as a list of rows. Each row is a compound term, where all rows have the same name and arity.

See also: - RFC 4180 */
To be done: - Implement immediate assert of the data to avoid possible stack overflows.; - Writing creates an intermediate code-list, possibly overflowing resources. This waits for pure output!

   68:- predicate_options(csv//2, 2,
   69                     [ separator(nonneg),       % mustv be code
   70                       strip(boolean),
   71                       ignore_quotes(boolean),
   72                       convert(boolean),
   73                       case(oneof([down,preserve,up])),
   74                       functor(atom),
   75                       arity(-nonneg),          % actually ?nonneg
   76                       match_arity(boolean)
   77                     ]).   78:- predicate_options(csv_read_file/3, 3,
   79                     [ pass_to(csv//2, 2),
   80                       pass_to(phrase_from_file/3, 3)
   81                     ]).   82:- predicate_options(csv_read_file_row/3, 3,
   83                     [ pass_to(csv//2, 2),
   84                       pass_to(open/4, 4)
   85                     ]).   86:- predicate_options(csv_write_file/3, 3,
   87                     [ pass_to(csv//2, 2),
   88                       pass_to(open/4, 4)
   89                     ]).   90:- predicate_options(csv_write_stream/3, 3,
   91                     [ pass_to(csv//2, 2)
   92                     ]).   93
   94
   95:- record
   96    csv_options(separator:integer=0',,
   97                strip:boolean=false,
   98                ignore_quotes:boolean=false,
   99                convert:boolean=true,
  100                case:oneof([down,preserve,up])=preserve,
  101                functor:atom=row,
  102                arity:integer,
  103                match_arity:boolean=true).

csv_read_file(+File, -Rows) is det

csv_read_file(+File, -Rows, +Options) is det

Read a CSV file into a list of rows. Each row is a Prolog term with the same arity. Options is handed to csv//2. Remaining options are processed by phrase_from_file/3. The default separator depends on the file name extension and is \t for .tsv files and , otherwise.

Suppose we want to create a predicate table/6 from a CSV file that we know contains 6 fields per record. This can be done using the code below. Without the option arity(6), this would generate a predicate table/N, where N is the number of fields per record in the data.

?- csv_read_file(File, Rows, [functor(table), arity(6)]),
   maplist(assert, Rows).

  127csv_read_file(File, Rows) :-
  128    csv_read_file(File, Rows, []).
  129
  130csv_read_file(File, Rows, Options) :-
  131    default_separator(File, Options, Options1),
  132    make_csv_options(Options1, Record, RestOptions),
  133    phrase_from_file(csv_roptions(Rows, Record), File, RestOptions).
  134
  135
  136default_separator(File, Options0, Options) :-
  137    (   option(separator(_), Options0)
  138    ->  Options = Options0
  139    ;   file_name_extension(_, Ext0, File),
  140        downcase_atom(Ext0, Ext),
  141        ext_separator(Ext, Sep)
  142    ->  Options = [separator(Sep)|Options0]
  143    ;   Options = Options0
  144    ).
  145
  146ext_separator(csv, 0',).
  147ext_separator(tsv, 0'\t).

csv(?Rows)// is det

csv(?Rows, +Options)// is det

Prolog DCG to `read/write' CSV data. Options:

separator(+Code): The comma-separator. Must be a character code. Default is (of course) the comma. Character codes can be specified using the 0' notion. E.g., using separator(0';) parses a semicolon separated file.
ignore_quotes(+Boolean): If true (default false), threat double quotes as a normal character.
strip(+Boolean): If true (default false), strip leading and trailing blank space. RFC4180 says that blank space is part of the data.
convert(+Boolean): If true (default), use name/2 on the field data. This translates the field into a number if possible.
case(+Action): If down, downcase atomic values. If up, upcase them and if preserve (default), do not change the case.
functor(+Atom): Functor to use for creating row terms. Default is row.
arity(?Arity): Number of fields in each row. This predicate raises a domain_error(row_arity(Expected), Found) if a row is found with different arity.
match_arity(+Boolean): If false (default true), do not reject CSV files where lines provide a varying number of fields (columns). This can be a work-around to use some incorrect CSV files.

  191csv(Rows) -->
  192    csv(Rows, []).
  193
  194csv(Rows, Options) -->
  195    { make_csv_options(Options, Record, _) },
  196    csv_roptions(Rows, Record).
  197
  198csv_roptions(Rows, Record) -->
  199    { ground(Rows) },
  200    !,
  201    emit_csv(Rows, Record).
  202csv_roptions(Rows, Record) -->
  203    csv_data(Rows, Record).
  204
  205csv_data([], _) -->
  206    eof,
  207    !.
  208csv_data([Row|More], Options) -->
  209    row(Row, Options),
  210    !,
  211    { debug(csv, 'Row: ~p', [Row]) },
  212    csv_data(More, Options).
  213
  214eof([], []).
  215
  216row(Row, Options) -->
  217    fields(Fields, Options),
  218    { csv_options_functor(Options, Functor),
  219      Row =.. [Functor|Fields],
  220      functor(Row, _, Arity),
  221      check_arity(Options, Arity)
  222    }.
  223
  224check_arity(Options, Arity) :-
  225    csv_options_arity(Options, Arity),
  226    !.
  227check_arity(Options, _) :-
  228    csv_options_match_arity(Options, false),
  229    !.
  230check_arity(Options, Arity) :-
  231    csv_options_arity(Options, Expected),
  232    domain_error(row_arity(Expected), Arity).
  233
  234fields([F|T], Options) -->
  235    field(F, Options),
  236    (   separator(Options)
  237    ->  fields(T, Options)
  238    ;   end_of_record
  239    ->  { T = [] }
  240    ).
  241
  242field(Value, Options) -->
  243    "\"",
  244    { csv_options_ignore_quotes(Options, false) },
  245    !,
  246    string_codes(Codes),
  247    { make_value(Codes, Value, Options) }.
  248field(Value, Options) -->
  249    { csv_options_strip(Options, true) },
  250    !,
  251    stripped_field(Value, Options).
  252field(Value, Options) -->
  253    { csv_options_separator(Options, Sep) },
  254    field_codes(Codes, Sep),
  255    { make_value(Codes, Value, Options) }.
  256
  257
  258stripped_field(Value, Options) -->
  259    ws,
  260    (   "\"",
  261        { csv_options_strip(Options, false) }
  262    ->  string_codes(Codes),
  263        ws
  264    ;   { csv_options_separator(Options, Sep) },
  265        field_codes(Codes0, Sep),
  266        { strip_trailing_ws(Codes0, Codes) }
  267    ),
  268    { make_value(Codes, Value, Options) }.
  269
  270ws --> " ", !, ws.
  271ws --> "\t", !, ws.
  272ws --> "".
  273
  274strip_trailing_ws(List, Stripped) :-
  275    append(Stripped, WS, List),
  276    all_ws(WS).
  277
  278all_ws([]).
  279all_ws([32|T]) :- all_ws(T).
  280all_ws([9|T]) :- all_ws(T).

string_codes(-Codes): Process a double-quotes string where the quote is escaped by doubling it. Eats the terminating double-quote.

  288string_codes(List) -->
  289    [H],
  290    (   { H == 0'" }
  291    ->  (   "\""
  292        ->  { List = [H|T] },
  293            string_codes(T)
  294        ;   { List = [] }
  295        )
  296    ;   { List = [H|T] },
  297        string_codes(T)
  298    ).
  299
  300field_codes([], Sep), [Sep] --> [Sep], !.
  301field_codes([], _), "\n" --> "\r\n", !.
  302field_codes([], _), "\n" --> "\n", !.
  303field_codes([], _), "\n" --> "\r", !.
  304field_codes([H|T], Sep) --> [H], !, field_codes(T, Sep).
  305field_codes([], _) --> [].              % unterminated last record

make_value(+Codes, -Value, +Options) is det: Convert a list of character codes to the actual value, depending on Options.

  312make_value(Codes, Value, Options) :-
  313    csv_options_convert(Options, Convert),
  314    csv_options_case(Options, Case),
  315    make_value(Convert, Case, Codes, Value).
  316
  317make_value(true, preserve, Codes, Value) :-
  318    !,
  319    name(Value, Codes).
  320make_value(true, Case, Codes, Value) :-
  321    !,
  322    (   number_string(Value, Codes)
  323    ->  true
  324    ;   make_value(false, Case, Codes, Value)
  325    ).
  326make_value(false, preserve, Codes, Value) :-
  327    !,
  328    atom_codes(Value, Codes).
  329make_value(false, down, Codes, Value) :-
  330    !,
  331    string_codes(String, Codes),
  332    downcase_atom(String, Value).
  333make_value(false, up, Codes, Value) :-
  334    string_codes(String, Codes),
  335    upcase_atom(String, Value).
  336
  337separator(Options) -->
  338    { csv_options_separator(Options, Sep) },
  339    [Sep].
  340
  341end_of_record --> "\n".			% Unix files
  342end_of_record --> "\r\n".               % DOS files
  343end_of_record --> "\r".                 % MacOS files
  344end_of_record --> eof.                  % unterminated last record

csv_read_file_row(+File, -Row, +Options) is nondet

True when Row is a row in File. First unifies Row with the first row in File. Backtracking yields the second, ... row. This interface is an alternative to csv_read_file/3 that avoids loading all rows in memory. Note that this interface does not guarantee that all rows in File have the same arity.

In addition to the options of csv_read_file/3, this predicate processes the option:

line(-Line): Line is unified with the 1-based line-number from which Row is read. Note that Line is not the physical line, but rather the logical record number.

To be done: - Input is read line by line. If a record separator is embedded in a quoted field, parsing the record fails and another line is added to the input. This does not nicely deal with other reasons why parsing the row may fail.

  368csv_read_file_row(File, Row, Options) :-
  369    default_separator(File, Options, Options1),
  370    make_csv_options(Options1, RecordOptions, Options2),
  371    select_option(line(Line), Options2, RestOptions, _),
  372    setup_call_cleanup(
  373        open(File, read, Stream, RestOptions),
  374        csv_read_stream_row(Stream, Row, Line, RecordOptions),
  375        close(Stream)).
  376
  377csv_read_stream_row(Stream, Row, Line, Options) :-
  378    between(1, infinite, Line),
  379    (   csv_read_row(Stream, Row0, Options),
  380        Row0 \== end_of_file
  381    ->  Row = Row0
  382    ;   !,
  383        fail
  384    ).

csv_read_row(+Stream, -Row, +CompiledOptions) is det: Read the next CSV record from Stream and unify the result with Row. CompiledOptions is created from options defined for csv//2 using csv_options/2. Row is unified with end_of_file upon reaching the end of the input.

  394csv_read_row(Stream, Row, _Record) :-
  395    at_end_of_stream(Stream),
  396    !,
  397    Row = end_of_file.
  398csv_read_row(Stream, Row, Record) :-
  399    read_lines_to_codes(Stream, Codes, Record, even),
  400    phrase(row(Row0, Record), Codes),
  401    !,
  402    Row = Row0.
  403
  404read_lines_to_codes(Stream, Codes, Options, QuoteQuantity) :-
  405    read_line_to_codes(Stream, Codes0),
  406    Codes0 \== end_of_file,
  407    (   (   csv_options_ignore_quotes(Options, true)
  408        ;   check_quotes(Codes0, QuoteQuantity, even)
  409        )
  410    ->  Codes = Codes0
  411    ;   append(Codes0, [0'\n|Tail], Codes),
  412        read_lines_to_codes(Stream, Tail, Options, odd)
  413    ).
  414
  415check_quotes([], QuoteQuantity, QuoteQuantity) :-
  416    !.
  417check_quotes([0'"|T], odd, Result) :-
  418    !,
  419    check_quotes(T, even, Result).
  420check_quotes([0'"|T], even, Result) :-
  421    !,
  422    check_quotes(T, odd, Result).
  423check_quotes([_|T], QuoteQuantity, Result) :-
  424    check_quotes(T, QuoteQuantity, Result).

csv_options(-Compiled, +Options) is det: Compiled is the compiled representation of the CSV processing options as they may be passed into csv//2, etc. This predicate is used in combination with csv_read_row/3 to avoid repeated processing of the options.

  434csv_options(Compiled, Options) :-
  435    make_csv_options(Options, Compiled, _Ignored).
  436
  437
  438                /*******************************
  439                *             OUTPUT           *
  440                *******************************/

csv_write_file(+File, +Data) is det
csv_write_file(+File, +Data, +Options) is det: Write a list of Prolog terms to a CSV file. Options are given to csv//2. Remaining options are given to open/4. The default separator depends on the file name extension and is \t for .tsv files and , otherwise.

  450csv_write_file(File, Data) :-
  451    csv_write_file(File, Data, []).
  452
  453csv_write_file(File, Data, Options) :-
  454    must_be(list, Data),
  455    default_separator(File, Options, Options1),
  456    make_csv_options(Options1, Record, RestOptions),
  457    phrase(emit_csv(Data, Record), String),
  458    setup_call_cleanup(
  459        open(File, write, Out, RestOptions),
  460        format(Out, '~s', [String]),
  461        close(Out)).
  462
  463
  464emit_csv([], _) --> [].
  465emit_csv([H|T], Options) -->
  466    emit_row(H, Options), "\r\n",   % RFC 4180 demands \r\n
  467    emit_csv(T, Options).
  468
  469emit_row(Row, Options) -->
  470    { Row =.. [_|Fields] },
  471    emit_fields(Fields, Options).
  472
  473emit_fields([H|T], Options) -->
  474    emit_field(H, Options),
  475    (   { T == [] }
  476        ->  []
  477        ;   { csv_options_separator(Options, Sep) },
  478        [Sep],
  479        emit_fields(T, Options)
  480    ).
  481
  482emit_field(H, Options) -->
  483    { (   atom(H)
  484      ->  atom_codes(H, Codes)
  485      ;   string(H)
  486      ->  string_codes(H, Codes)
  487      )
  488    },
  489    !,
  490    (   { needs_quotes(H, Options) }
  491    ->  "\"", emit_string(Codes), "\""
  492    ;   emit_codes(Codes)
  493    ).
  494emit_field([], _) -->
  495    !,
  496    { atom_codes('[]', Codes) },
  497    emit_codes(Codes).
  498emit_field(H, _) -->
  499    { number_codes(H,Codes) },
  500    emit_codes(Codes).
  501
  502needs_quotes(Atom, _) :-
  503    sub_atom(Atom, _, _, _, '"'),
  504    !.
  505needs_quotes(Atom, _) :-
  506    sub_atom(Atom, _, _, _, '\n'),
  507    !.
  508needs_quotes(Atom, _) :-
  509    sub_atom(Atom, _, _, _, '\r'),
  510    !.
  511needs_quotes(Atom, Options) :-
  512    csv_options_separator(Options, Sep),
  513    char_code(Char, Sep),
  514    sub_atom(Atom, _, _, _, Char),
  515    !.
  516
  517emit_string([]) --> "".
  518emit_string([0'"|T]) --> !, "\"\"", emit_string(T).
  519emit_string([H|T]) --> [H], emit_string(T).
  520
  521emit_codes([]) --> "".
  522emit_codes([0'"|T]) --> !, "\"\"", emit_codes(T).
  523emit_codes([H|T]) --> [H], emit_codes(T).

csv_write_stream(+Stream, +Data, +Options) is det

Write the rows in Data to Stream. This is similar to csv_write_file/3, but can deal with data that is produced incrementally. The example below saves all answers from the predicate data/3 to File.

save_data(File) :-
   setup_call_cleanup(
       open(File, write, Out),
       forall(data(C1,C2,C3),
              csv_write_stream(Out, [row(C1,C2,C3)], [])),
       close(Out)),

  542csv_write_stream(Stream, Data, Options) :-
  543    must_be(list, Data),
  544    make_csv_options(Options, Record, _),
  545    phrase(emit_csv(Data, Record), String),
  546    format(Stream, '~s', [String])