[m-users.] Read csv file with variable number and type of fields

Julien Fischer jfischer at opturion.com
Wed Sep 2 14:44:36 AEST 2015


Hi Dirk,

On Tue, 1 Sep 2015, Dirk Ziegemeyer wrote:

> I need to read csv files where the number and the type of fields in a
> record is not fixed in advance. Instead the record structure is
> determined by the header record of the csv file together with a
> database which assigns a data type to every header field name.
>
> I’m wondering if I can use the library https://github.com/juliensf/mercury-csv for that.
>
> It seems that I need to know the record structure in order to
> initialize a csv reader with csv.init_reader/3.

Yes, but there's nothing that prevents you from reading in the header
line, setting up the record structure based on that and then
initializing a new CSV reader based on that.

> My idea is to read the file in two steps:
> 1. the header line (in order to determine number and type of fields), e.g. with some DCG rules or module parsing_utils
> 2. the rest
>
> Is this a valid approach and can I combine io.read_line_as_string/3 to
> read the header line and stream.get/4 to read the rest of the file?

Yes, it would work.  It's going to be a bit fiddly as you are going to
have to handle things like quoted header names etc. yourself.  A better
approach would be to use the CSV library's raw_reader to read in just
the header line, set up the record structure based on that, and then
initialize a new reader to handle the rest of the data.

I've attached a small example of how this could be done.

Julien.
-------------- next part --------------
%-----------------------------------------------------------------------------%
% vim: ft=mercury ts=4 sw=4 et
%-----------------------------------------------------------------------------%
%
% Read CSV data from the standard input.  Use the header fields to determine
% the structure of the rest of the file.
%
% Fields whose header has the prefix "INT_" are treated as integer
% fields.
% Fields whose header has the prefix "FLOAT_" are treated as float fields.
% Fields whose header has the prefix "STRING_" are treated as strings.
% Fields whose header has the prefix "DATE_" are treated as dates in
% YYYY-MM-DD format.
% Fields whose header does not match one of the above are discarded.
%
%-----------------------------------------------------------------------------%

:- module infer_desc_ex.
:- interface.

:- import_module io.

:- pred main(io::di, io::uo) is det.

%-----------------------------------------------------------------------------%
%-----------------------------------------------------------------------------%

:- implementation.

:- import_module csv.
:- import_module infer_desc_from_header.

:- import_module list.
:- import_module stream.
:- import_module string.

%-----------------------------------------------------------------------------%

% Program entry point.
%
% Builds a CSV reader over the standard input whose record structure is
% derived from the header line (via infer_pred/4), reads the remaining data
% with a single stream.get/4 call and prints each record on its own line.
% If either initialising the reader or reading the data fails, a message is
% written to the standard error stream and the exit status is set to 1.
%
main(!IO) :-
    io.stdin_stream(Stdin, !IO),
    init_with_inferred_desc(Stdin, infer_pred, InitReaderResult, !IO),
    (
        InitReaderResult = ok(Reader),
        stream.get(Reader, CSVDataResult, !IO),
        (
            CSVDataResult = ok(CSVData),
            io.write_list(CSVData ^ csv_records, "", io.print_line, !IO)
        ;
            % No data was returned after the header, so there is nothing
            % to print; this is not treated as an error.
            CSVDataResult = eof
        ;
            CSVDataResult = error(Error),
            ErrorMsg = stream.error_message(Error),
            io.stderr_stream(Stderr, !IO),
            % The message ends with exactly one newline, matching the
            % format used for reader initialisation errors below.
            io.format(Stderr, "error: %s.\n", [s(ErrorMsg)], !IO),
            io.set_exit_status(1, !IO)
        )
    ;
        InitReaderResult = error(ErrorMsg),
        io.stderr_stream(Stderr, !IO),
        io.format(Stderr, "error: %s.\n", [s(ErrorMsg)], !IO),
        io.set_exit_status(1, !IO)
    ).

:- pred infer_pred(string::in, field_desc::out, io::di, io::uo) is det.

% Choose the field descriptor for a column from the prefix of its header
% name: "INT_", "FLOAT_", "STRING_" and "DATE_" select integer, float, string
% and yyyy-mm-dd date fields respectively.  A header with none of these
% prefixes yields a 'discard' descriptor.  The I/O state is threaded through
% unchanged.
%
infer_pred(FieldName, FieldDesc, !IO) :-
    % Every recognised field shares the same width limit and trimming policy.
    Width = no_limit,
    Trim = trim_whitespace,
    ( string.prefix(FieldName, "INT_") ->
        FieldDesc = field_desc(int(do_not_allow_floats, []), Width, Trim)
    ; string.prefix(FieldName, "FLOAT_") ->
        FieldDesc = field_desc(float([]), Width, Trim)
    ; string.prefix(FieldName, "STRING_") ->
        FieldDesc = field_desc(string([]), Width, Trim)
    ; string.prefix(FieldName, "DATE_") ->
        FieldDesc = field_desc(date(yyyy_mm_dd("-"), []), Width, Trim)
    ;
        % Unrecognised header: drop the column.
        FieldDesc = discard(no_limit)
    ).

%-----------------------------------------------------------------------------%
:- end_module infer_desc_ex.
%-----------------------------------------------------------------------------%
-------------- next part --------------
%-----------------------------------------------------------------------------%
% vim: ft=mercury ts=4 sw=4 et
%-----------------------------------------------------------------------------%

:- module infer_desc_from_header.
:- interface.

:- import_module csv.

:- import_module io.
:- import_module stream.

%-----------------------------------------------------------------------------%

% The type and inst of the caller-supplied closure that init_with_inferred_desc
% uses to map header fields to field descriptors.  The closure takes the
% string value of one header field, returns the field_desc to use for that
% field, and threads the I/O state through.  It must be det.
%
:- type infer_desc_pred == pred(string, field_desc, io, io).
:- inst infer_desc_pred == (pred(in, out, di, uo) is det).

    % init_with_inferred_desc(Stream, ToFieldDescPred, Result, !IO):
    %
    % Read the first record from Stream using a raw CSV reader and treat it
    % as the header.  ToFieldDescPred is called on the value of each header
    % field, in order, and the resulting field descriptors are used to
    % initialise a CSV reader over the same stream with 'no_header' (the
    % header has already been consumed).
    %
    % Result is ok(Reader) on success.  It is error(Msg) if Stream was
    % already at EOF or if reading the header record failed.
    %
:- pred init_with_inferred_desc(io.text_input_stream::in,
    infer_desc_pred::in(infer_desc_pred),
    stream.res(csv.reader(io.text_input_stream), string)::out,
    io::di, io::uo) is det.

%-----------------------------------------------------------------------------%
%-----------------------------------------------------------------------------%

:- implementation.

:- import_module list.
:- import_module require.

%-----------------------------------------------------------------------------%

init_with_inferred_desc(Stream, ToFieldDescPred, Result, !IO) :-
    % Pull the header record off the stream with a raw reader.
    csv.init_raw_reader(Stream, HeaderReader, !IO),
    stream.get(HeaderReader, HeaderResult, !IO),
    (
        HeaderResult = eof,
        Result = error("stream is at EOF")
    ;
        HeaderResult = error(HeaderError),
        Result = error(stream.error_message(HeaderError))
    ;
        HeaderResult = ok(raw_record(_LineNo, HeaderFields)),
        % Ask the caller's closure for one field descriptor per header
        % field; together these describe the records that follow.
        list.map_foldl(header_field_to_desc(ToFieldDescPred), HeaderFields,
            RecordDesc, !IO),
        % The header has already been consumed above, so the new reader
        % must not expect one.
        csv.init_reader(Stream, no_header, RecordDesc, Reader, !IO),
        Result = ok(Reader)
    ).

:- pred header_field_to_desc(infer_desc_pred::in(infer_desc_pred),
    raw_field::in, field_desc::out, io::di, io::uo) is det.

% Apply the caller-supplied closure to the first argument of a raw header
% field (its value); the other two arguments of raw_field are ignored.
%
header_field_to_desc(ToFieldDescPred, raw_field(HeaderName, _, _), FieldDesc,
        !IO) :-
    call(ToFieldDescPred, HeaderName, FieldDesc, !IO).

%-----------------------------------------------------------------------------%
:- end_module infer_desc_from_header.
%-----------------------------------------------------------------------------%


More information about the users mailing list