csv.pl

View source with formatted comments or as raw
    1/*  Part of SWI-Prolog
    2
    3    Author:        Jan Wielemaker
    4    E-mail:        J.Wielemaker@vu.nl
    5    WWW:           http://www.swi-prolog.org
    6    Copyright (c)  2009-2017, VU University Amsterdam
    7    All rights reserved.
    8
    9    Redistribution and use in source and binary forms, with or without
   10    modification, are permitted provided that the following conditions
   11    are met:
   12
   13    1. Redistributions of source code must retain the above copyright
   14       notice, this list of conditions and the following disclaimer.
   15
   16    2. Redistributions in binary form must reproduce the above copyright
   17       notice, this list of conditions and the following disclaimer in
   18       the documentation and/or other materials provided with the
   19       distribution.
   20
   21    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   22    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   23    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
   24    FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
   25    COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
   26    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
   27    BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
   28    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
   29    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   30    LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
   31    ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   32    POSSIBILITY OF SUCH DAMAGE.
   33*/
   34
   35:- module(csv,
   36          [ csv//1,                     % +Rows
   37            csv//2,                     % +Rows, +Options
   38
   39            csv_read_file/2,            % +File, -Data
   40            csv_read_file/3,            % +File, -Data, +Options
   41            csv_read_file_row/3,        % +File, -Row, +Options
   42            csv_read_row/3,		% +Stream, -Row, +CompiledOptions
   43            csv_options/2,		% -Compiled, +Options
   44
   45            csv_write_file/2,           % +File, +Data
   46            csv_write_file/3,           % +File, +Data, +Options
   47            csv_write_stream/3          % +Stream, +Data, +Options
   48          ]).   49:- use_module(library(record)).   50:- use_module(library(error)).   51:- use_module(library(pure_input)).   52:- use_module(library(debug)).   53:- use_module(library(option)).   54
   55/** <module> Process CSV (Comma-Separated Values) data
   56
   57This library parses and generates CSV data.   CSV data is represented in
   58Prolog as a list of rows. Each row   is  a compound term, where all rows
   59have the same name and arity.
   60
   61@tbd    Implement immediate assert of the data to avoid possible stack
   62        overflows.
   63@tbd    Writing creates an intermediate code-list, possibly overflowing
   64        resources.  This waits for pure output!
   65@see RFC 4180
   66*/
   67
   68:- predicate_options(csv//2, 2,
   69                     [ separator(nonneg),       % mustv be code
   70                       strip(boolean),
   71                       ignore_quotes(boolean),
   72                       convert(boolean),
   73                       case(oneof([down,preserve,up])),
   74                       functor(atom),
   75                       arity(-nonneg),          % actually ?nonneg
   76                       match_arity(boolean)
   77                     ]).   78:- predicate_options(csv_read_file/3, 3,
   79                     [ pass_to(csv//2, 2),
   80                       pass_to(phrase_from_file/3, 3)
   81                     ]).   82:- predicate_options(csv_read_file_row/3, 3,
   83                     [ pass_to(csv//2, 2),
   84                       pass_to(open/4, 4)
   85                     ]).   86:- predicate_options(csv_write_file/3, 3,
   87                     [ pass_to(csv//2, 2),
   88                       pass_to(open/4, 4)
   89                     ]).   90:- predicate_options(csv_write_stream/3, 3,
   91                     [ pass_to(csv//2, 2)
   92                     ]).   93
   94
   95:- record
   96    csv_options(separator:integer=0',,
   97                strip:boolean=false,
   98                ignore_quotes:boolean=false,
   99                convert:boolean=true,
  100                case:oneof([down,preserve,up])=preserve,
  101                functor:atom=row,
  102                arity:integer,
  103                match_arity:boolean=true).  104
  105
  106%!  csv_read_file(+File, -Rows) is det.
  107%!  csv_read_file(+File, -Rows, +Options) is det.
  108%
  109%   Read a CSV file into a list of   rows. Each row is a Prolog term
  110%   with the same arity. Options  is   handed  to  csv//2. Remaining
  111%   options  are  processed  by    phrase_from_file/3.  The  default
  112%   separator depends on the file name   extension and is =|\t|= for
  113%   =|.tsv|= files and =|,|= otherwise.
  114%
  115%   Suppose we want to create a predicate   table/6  from a CSV file
  116%   that we know contains 6 fields  per   record.  This  can be done
  117%   using the code below. Without the   option  arity(6), this would
  118%   generate a predicate table/N, where N   is  the number of fields
  119%   per record in the data.
  120%
  121%       ==
  122%       ?- csv_read_file(File, Rows, [functor(table), arity(6)]),
  123%          maplist(assert, Rows).
  124%       ==
  125
  126
  127csv_read_file(File, Rows) :-
  128    csv_read_file(File, Rows, []).
  129
  130csv_read_file(File, Rows, Options) :-
  131    default_separator(File, Options, Options1),
  132    make_csv_options(Options1, Record, RestOptions),
  133    phrase_from_file(csv_roptions(Rows, Record), File, RestOptions).
  134
  135
  136default_separator(File, Options0, Options) :-
  137    (   option(separator(_), Options0)
  138    ->  Options = Options0
  139    ;   file_name_extension(_, Ext0, File),
  140        downcase_atom(Ext0, Ext),
  141        ext_separator(Ext, Sep)
  142    ->  Options = [separator(Sep)|Options0]
  143    ;   Options = Options0
  144    ).
  145
  146ext_separator(csv, 0',).
  147ext_separator(tsv, 0'\t).
  148
  149
  150%!  csv(?Rows)// is det.
  151%!  csv(?Rows, +Options)// is det.
  152%
  153%   Prolog DCG to `read/write' CSV data.  Options:
  154%
  155%       * separator(+Code)
  156%       The comma-separator.  Must be a character code.  Default is
  157%       (of course) the comma. Character codes can be specified
  158%       using the 0' notion. E.g., using =|separator(0';)|= parses
  159%       a semicolon separated file.
  160%
  161%       * ignore_quotes(+Boolean)
  162%       If =true= (default false), threat double quotes as a normal
  163%       character.
  164%
  165%       * strip(+Boolean)
  166%       If =true= (default =false=), strip leading and trailing
  167%       blank space.  RFC4180 says that blank space is part of the
  168%       data.
  169%
  170%       * convert(+Boolean)
  171%       If =true= (default), use name/2 on the field data.  This
  172%       translates the field into a number if possible.
  173%
  174%       * case(+Action)
  175%       If =down=, downcase atomic values.  If =up=, upcase them
  176%       and if =preserve= (default), do not change the case.
  177%
  178%       * functor(+Atom)
  179%       Functor to use for creating row terms.  Default is =row=.
  180%
  181%       * arity(?Arity)
  182%       Number of fields in each row.  This predicate raises
  183%       a domain_error(row_arity(Expected), Found) if a row is
  184%       found with different arity.
  185%
  186%       * match_arity(+Boolean)
  187%       If =false= (default =true=), do not reject CSV files where
  188%       lines provide a varying number of fields (columns).  This
  189%       can be a work-around to use some incorrect CSV files.
  190
  191csv(Rows) -->
  192    csv(Rows, []).
  193
  194csv(Rows, Options) -->
  195    { make_csv_options(Options, Record, _) },
  196    csv_roptions(Rows, Record).
  197
  198csv_roptions(Rows, Record) -->
  199    { ground(Rows) },
  200    !,
  201    emit_csv(Rows, Record).
  202csv_roptions(Rows, Record) -->
  203    csv_data(Rows, Record).
  204
  205csv_data([], _) -->
  206    eof,
  207    !.
  208csv_data([Row|More], Options) -->
  209    row(Row, Options),
  210    !,
  211    { debug(csv, 'Row: ~p', [Row]) },
  212    csv_data(More, Options).
  213
  214eof([], []).
  215
  216row(Row, Options) -->
  217    fields(Fields, Options),
  218    { csv_options_functor(Options, Functor),
  219      Row =.. [Functor|Fields],
  220      functor(Row, _, Arity),
  221      check_arity(Options, Arity)
  222    }.
  223
  224check_arity(Options, Arity) :-
  225    csv_options_arity(Options, Arity),
  226    !.
  227check_arity(Options, _) :-
  228    csv_options_match_arity(Options, false),
  229    !.
  230check_arity(Options, Arity) :-
  231    csv_options_arity(Options, Expected),
  232    domain_error(row_arity(Expected), Arity).
  233
  234fields([F|T], Options) -->
  235    field(F, Options),
  236    (   separator(Options)
  237    ->  fields(T, Options)
  238    ;   end_of_record
  239    ->  { T = [] }
  240    ).
  241
  242field(Value, Options) -->
  243    "\"",
  244    { csv_options_ignore_quotes(Options, false) },
  245    !,
  246    string_codes(Codes),
  247    { make_value(Codes, Value, Options) }.
  248field(Value, Options) -->
  249    { csv_options_strip(Options, true) },
  250    !,
  251    stripped_field(Value, Options).
  252field(Value, Options) -->
  253    { csv_options_separator(Options, Sep) },
  254    field_codes(Codes, Sep),
  255    { make_value(Codes, Value, Options) }.
  256
  257
  258stripped_field(Value, Options) -->
  259    ws,
  260    (   "\"",
  261        { csv_options_strip(Options, false) }
  262    ->  string_codes(Codes),
  263        ws
  264    ;   { csv_options_separator(Options, Sep) },
  265        field_codes(Codes0, Sep),
  266        { strip_trailing_ws(Codes0, Codes) }
  267    ),
  268    { make_value(Codes, Value, Options) }.
  269
  270ws --> " ", !, ws.
  271ws --> "\t", !, ws.
  272ws --> "".
  273
  274strip_trailing_ws(List, Stripped) :-
  275    append(Stripped, WS, List),
  276    all_ws(WS).
  277
  278all_ws([]).
  279all_ws([32|T]) :- all_ws(T).
  280all_ws([9|T]) :- all_ws(T).
  281
  282
  283%!  string_codes(-Codes)
  284%
  285%   Process a double-quotes string where  the   quote  is escaped by
  286%   doubling it. Eats the terminating double-quote.
  287
  288string_codes(List) -->
  289    [H],
  290    (   { H == 0'" }
  291    ->  (   "\""
  292        ->  { List = [H|T] },
  293            string_codes(T)
  294        ;   { List = [] }
  295        )
  296    ;   { List = [H|T] },
  297        string_codes(T)
  298    ).
  299
  300field_codes([], Sep), [Sep] --> [Sep], !.
  301field_codes([], _), "\n" --> "\r\n", !.
  302field_codes([], _), "\n" --> "\n", !.
  303field_codes([], _), "\n" --> "\r", !.
  304field_codes([H|T], Sep) --> [H], !, field_codes(T, Sep).
  305field_codes([], _) --> [].              % unterminated last record
  306
  307%!  make_value(+Codes, -Value, +Options) is det.
  308%
  309%   Convert a list of character codes to the actual value, depending
  310%   on Options.
  311
  312make_value(Codes, Value, Options) :-
  313    csv_options_convert(Options, Convert),
  314    csv_options_case(Options, Case),
  315    make_value(Convert, Case, Codes, Value).
  316
  317make_value(true, preserve, Codes, Value) :-
  318    !,
  319    name(Value, Codes).
  320make_value(true, Case, Codes, Value) :-
  321    !,
  322    (   number_string(Value, Codes)
  323    ->  true
  324    ;   make_value(false, Case, Codes, Value)
  325    ).
  326make_value(false, preserve, Codes, Value) :-
  327    !,
  328    atom_codes(Value, Codes).
  329make_value(false, down, Codes, Value) :-
  330    !,
  331    string_codes(String, Codes),
  332    downcase_atom(String, Value).
  333make_value(false, up, Codes, Value) :-
  334    string_codes(String, Codes),
  335    upcase_atom(String, Value).
  336
  337separator(Options) -->
  338    { csv_options_separator(Options, Sep) },
  339    [Sep].
  340
  341end_of_record --> "\n".			% Unix files
  342end_of_record --> "\r\n".               % DOS files
  343end_of_record --> "\r".                 % MacOS files
  344end_of_record --> eof.                  % unterminated last record
  345
  346
  347%!  csv_read_file_row(+File, -Row, +Options) is nondet.
  348%
  349%   True when Row is a row in File.  First unifies Row with the first
  350%   row in File. Backtracking  yields  the   second,  ...  row.  This
  351%   interface  is  an  alternative  to  csv_read_file/3  that  avoids
  352%   loading all rows in memory.  Note   that  this interface does not
  353%   guarantee that all rows in File have the same arity.
  354%
  355%   In addition to the  options   of  csv_read_file/3, this predicate
  356%   processes the option:
  357%
  358%     * line(-Line)
  359%     Line is unified with the 1-based line-number from which Row is
  360%     read.  Note that Line is not the physical line, but rather the
  361%     _logical_ record number.
  362%
  363%   @tbd    Input is read line by line.  If a record separator is
  364%           embedded in a quoted field, parsing the record fails and
  365%           another line is added to the input.  This does not nicely
  366%           deal with other reasons why parsing the row may fail.
  367
  368csv_read_file_row(File, Row, Options) :-
  369    default_separator(File, Options, Options1),
  370    make_csv_options(Options1, RecordOptions, Options2),
  371    select_option(line(Line), Options2, RestOptions, _),
  372    setup_call_cleanup(
  373        open(File, read, Stream, RestOptions),
  374        csv_read_stream_row(Stream, Row, Line, RecordOptions),
  375        close(Stream)).
  376
  377csv_read_stream_row(Stream, Row, Line, Options) :-
  378    between(1, infinite, Line),
  379    (   csv_read_row(Stream, Row0, Options),
  380        Row0 \== end_of_file
  381    ->  Row = Row0
  382    ;   !,
  383        fail
  384    ).
  385
  386
  387%!  csv_read_row(+Stream, -Row, +CompiledOptions) is det.
  388%
  389%   Read the next CSV record from Stream  and unify the result with Row.
  390%   CompiledOptions is created from  options   defined  for csv//2 using
  391%   csv_options/2. Row is unified with   `end_of_file` upon reaching the
  392%   end of the input.
  393
  394csv_read_row(Stream, Row, _Record) :-
  395    at_end_of_stream(Stream),
  396    !,
  397    Row = end_of_file.
  398csv_read_row(Stream, Row, Record) :-
  399    read_lines_to_codes(Stream, Codes, Record, even),
  400    phrase(row(Row0, Record), Codes),
  401    !,
  402    Row = Row0.
  403
  404read_lines_to_codes(Stream, Codes, Options, QuoteQuantity) :-
  405    read_line_to_codes(Stream, Codes0),
  406    Codes0 \== end_of_file,
  407    (   (   csv_options_ignore_quotes(Options, true)
  408        ;   check_quotes(Codes0, QuoteQuantity, even)
  409        )
  410    ->  Codes = Codes0
  411    ;   append(Codes0, [0'\n|Tail], Codes),
  412        read_lines_to_codes(Stream, Tail, Options, odd)
  413    ).
  414
  415check_quotes([], QuoteQuantity, QuoteQuantity) :-
  416    !.
  417check_quotes([0'"|T], odd, Result) :-
  418    !,
  419    check_quotes(T, even, Result).
  420check_quotes([0'"|T], even, Result) :-
  421    !,
  422    check_quotes(T, odd, Result).
  423check_quotes([_|T], QuoteQuantity, Result) :-
  424    check_quotes(T, QuoteQuantity, Result).
  425
  426
  427%!  csv_options(-Compiled, +Options) is det.
  428%
  429%   Compiled is the  compiled  representation   of  the  CSV  processing
  430%   options as they may be passed into   csv//2,  etc. This predicate is
  431%   used in combination with csv_read_row/3 to avoid repeated processing
  432%   of the options.
  433
  434csv_options(Compiled, Options) :-
  435    make_csv_options(Options, Compiled, _Ignored).
  436
  437
  438                /*******************************
  439                *             OUTPUT           *
  440                *******************************/
  441
  442%!  csv_write_file(+File, +Data) is det.
  443%!  csv_write_file(+File, +Data, +Options) is det.
  444%
  445%   Write a list of Prolog terms to a CSV file.  Options are given
  446%   to csv//2.  Remaining options are given to open/4.  The  default
  447%   separator depends on the file name   extension and is =|\t|= for
  448%   =|.tsv|= files and =|,|= otherwise.
  449
  450csv_write_file(File, Data) :-
  451    csv_write_file(File, Data, []).
  452
  453csv_write_file(File, Data, Options) :-
  454    must_be(list, Data),
  455    default_separator(File, Options, Options1),
  456    make_csv_options(Options1, Record, RestOptions),
  457    phrase(emit_csv(Data, Record), String),
  458    setup_call_cleanup(
  459        open(File, write, Out, RestOptions),
  460        format(Out, '~s', [String]),
  461        close(Out)).
  462
  463
  464emit_csv([], _) --> [].
  465emit_csv([H|T], Options) -->
  466    emit_row(H, Options), "\r\n",   % RFC 4180 demands \r\n
  467    emit_csv(T, Options).
  468
  469emit_row(Row, Options) -->
  470    { Row =.. [_|Fields] },
  471    emit_fields(Fields, Options).
  472
  473emit_fields([H|T], Options) -->
  474    emit_field(H, Options),
  475    (   { T == [] }
  476        ->  []
  477        ;   { csv_options_separator(Options, Sep) },
  478        [Sep],
  479        emit_fields(T, Options)
  480    ).
  481
  482emit_field(H, Options) -->
  483    { (   atom(H)
  484      ->  atom_codes(H, Codes)
  485      ;   string(H)
  486      ->  string_codes(H, Codes)
  487      )
  488    },
  489    !,
  490    (   { needs_quotes(H, Options) }
  491    ->  "\"", emit_string(Codes), "\""
  492    ;   emit_codes(Codes)
  493    ).
  494emit_field([], _) -->
  495    !,
  496    { atom_codes('[]', Codes) },
  497    emit_codes(Codes).
  498emit_field(H, _) -->
  499    { number_codes(H,Codes) },
  500    emit_codes(Codes).
  501
  502needs_quotes(Atom, _) :-
  503    sub_atom(Atom, _, _, _, '"'),
  504    !.
  505needs_quotes(Atom, _) :-
  506    sub_atom(Atom, _, _, _, '\n'),
  507    !.
  508needs_quotes(Atom, _) :-
  509    sub_atom(Atom, _, _, _, '\r'),
  510    !.
  511needs_quotes(Atom, Options) :-
  512    csv_options_separator(Options, Sep),
  513    char_code(Char, Sep),
  514    sub_atom(Atom, _, _, _, Char),
  515    !.
  516
  517emit_string([]) --> "".
  518emit_string([0'"|T]) --> !, "\"\"", emit_string(T).
  519emit_string([H|T]) --> [H], emit_string(T).
  520
  521emit_codes([]) --> "".
  522emit_codes([0'"|T]) --> !, "\"\"", emit_codes(T).
  523emit_codes([H|T]) --> [H], emit_codes(T).
  524
  525
  526%%     csv_write_stream(+Stream, +Data, +Options) is det.
  527%
  528%      Write  the  rows  in  Data  to    Stream.   This  is  similar  to
  529%      csv_write_file/3,  but  can  deal  with  data  that  is  produced
  530%      incrementally. The example  below  saves   all  answers  from the
  531%      predicate data/3 to File.
  532%
  533%        ==
  534%        save_data(File) :-
  535%           setup_call_cleanup(
  536%               open(File, write, Out),
  537%               forall(data(C1,C2,C3),
  538%                      csv_write_stream(Out, [row(C1,C2,C3)], [])),
  539%               close(Out)),
  540%        ==
  541
  542csv_write_stream(Stream, Data, Options) :-
  543    must_be(list, Data),
  544    make_csv_options(Options, Record, _),
  545    phrase(emit_csv(Data, Record), String),
  546    format(Stream, '~s', [String])