[m-users.] Read csv file with variable number and type of fields
Julien Fischer
jfischer at opturion.com
Wed Sep 2 14:44:36 AEST 2015
Hi Dirk,
On Tue, 1 Sep 2015, Dirk Ziegemeyer wrote:
> I need to read csv files where the number and the type of fields in a
> record is not fixed in advance. Instead the record structure is
> determined by the header record of the csv file together with a
> database which assigns a data type to every header field name.
>
> I’m wondering if I can use the library https://github.com/juliensf/mercury-csv for that.
>
> It seems that I need to know the record structure in order to
> initialize a csv reader with csv.init_reader/3.
Yes, but there's nothing that prevents you from reading in the header
line, setting up the record structure based on that and then
initializing a new CSV reader based on that.
> My idea is to read the file in two steps:
> 1. the header line (in order to determine number and type of fields), e.g. with some DCG rules or module parsing_utils
> 2. the rest
>
> Is this a valid approach and can I combine io.read_line_as_string/3 to
> read the header line and stream.get/4 to read the rest of the file?
Yes, it would work. It's going to be a bit fiddly as you are going to
have to handle things like quoted header names etc yourself. A better
approach would be to use the CSV library's raw_reader to read in just
the header line, set up the record structure based on that, and then
initialize a new reader to handle the rest of the data.
I've attached a small example of how this could be done.
Julien.
-------------- next part --------------
%-----------------------------------------------------------------------------%
% vim: ft=mercury ts=4 sw=4 et
%-----------------------------------------------------------------------------%
%
% Read CSV data from the standard input. Use the header fields to determine
% the structure of the rest of the file.
%
% Fields whose header has the prefix "INT_" are treated as integer
% fields.
% Fields whose header has the prefix "FLOAT_" are treated as float fields.
% Fields whose header has the prefix "STRING_" are treated as strings.
% Fields whose header has the prefix "DATE_" are treated as dates in
% YYYY-MM-DD format.
% Fields whose header does not match one of the above are discarded.
%
%-----------------------------------------------------------------------------%
:- module infer_desc_ex.
:- interface.
:- import_module io.
:- pred main(io::di, io::uo) is det.
%-----------------------------------------------------------------------------%
%-----------------------------------------------------------------------------%
:- implementation.
:- import_module csv.
:- import_module infer_desc_from_header.
:- import_module list.
:- import_module stream.
:- import_module string.
%-----------------------------------------------------------------------------%
% main(!IO):
%
% Read CSV data from the standard input.  The record structure is derived
% from the header line by init_with_inferred_desc/5 using infer_pred/4.
% Each record that is read is printed on its own line on the standard output.
% If the reader cannot be initialised, or reading the data fails, an error
% message is written to the standard error and the exit status is set to 1.
%
main(!IO) :-
    io.stdin_stream(Stdin, !IO),
    init_with_inferred_desc(Stdin, infer_pred, InitReaderResult, !IO),
    (
        InitReaderResult = ok(Reader),
        stream.get(Reader, CSVDataResult, !IO),
        (
            CSVDataResult = ok(CSVData),
            io.write_list(CSVData ^ csv_records, "", io.print_line, !IO)
        ;
            % The reader reported end-of-file: there are no records to print.
            CSVDataResult = eof
        ;
            CSVDataResult = error(Error),
            ErrorMsg = stream.error_message(Error),
            io.stderr_stream(Stderr, !IO),
            % The message ends with the newline, exactly as in the
            % initialisation error branch below; nothing may follow it.
            io.format(Stderr, "error: %s.\n", [s(ErrorMsg)], !IO),
            io.set_exit_status(1, !IO)
        )
    ;
        InitReaderResult = error(ErrorMsg),
        io.stderr_stream(Stderr, !IO),
        io.format(Stderr, "error: %s.\n", [s(ErrorMsg)], !IO),
        io.set_exit_status(1, !IO)
    ).
% infer_pred(FieldName, FieldDesc, !IO):
%
% Choose the field descriptor for a column from the prefix of its header
% name.  "INT_", "FLOAT_", "STRING_" and "DATE_" select an integer, float,
% string and YYYY-MM-DD date field respectively; each of these has no
% width limit and trims whitespace.  Any other header name yields a
% discarded field.  The I/O state is threaded through unchanged.
%
:- pred infer_pred(string::in, field_desc::out, io::di, io::uo) is det.

infer_pred(FieldName, FieldDesc, !IO) :-
    ( string.prefix(FieldName, "INT_") ->
        FieldDesc = field_desc(int(do_not_allow_floats, []), no_limit,
            trim_whitespace)
    ; string.prefix(FieldName, "FLOAT_") ->
        FieldDesc = field_desc(float([]), no_limit, trim_whitespace)
    ; string.prefix(FieldName, "STRING_") ->
        FieldDesc = field_desc(string([]), no_limit, trim_whitespace)
    ; string.prefix(FieldName, "DATE_") ->
        FieldDesc = field_desc(date(yyyy_mm_dd("-"), []), no_limit,
            trim_whitespace)
    ;
        % An unrecognised header: drop the column.
        FieldDesc = discard(no_limit)
    ).
%-----------------------------------------------------------------------------%
:- end_module infer_desc_ex.
%-----------------------------------------------------------------------------%
-------------- next part --------------
%-----------------------------------------------------------------------------%
% vim: ft=mercury ts=4 sw=4 et
%-----------------------------------------------------------------------------%
:- module infer_desc_from_header.
:- interface.
:- import_module csv.
:- import_module io.
:- import_module stream.
%-----------------------------------------------------------------------------%
% A closure that is given the value of one header field and returns the
% field_desc to use for it.  It may perform I/O (for example, to consult
% an external source); its mode is given by the inst of the same name.
%
:- type infer_desc_pred == pred(string, field_desc, io, io).
:- inst infer_desc_pred == (pred(in, out, di, uo) is det).
% init_with_inferred_desc(Stream, ToFieldDescPred, Result, !IO):
%
% Read the first record from Stream with a raw CSV reader and call
% ToFieldDescPred on the value of each of its fields, in order.  The
% resulting list of field descriptors is used to initialise a CSV reader
% on Stream with 'no_header', since the header has already been consumed.
%
% Result is ok(Reader) on success.  It is error(Message) if Stream is at
% end-of-file when the header is read, or if the raw reader reports an
% error; in the latter case Message is that error's message.
%
:- pred init_with_inferred_desc(io.text_input_stream::in,
infer_desc_pred::in(infer_desc_pred),
stream.res(csv.reader(io.text_input_stream), string)::out,
io::di, io::uo) is det.
%-----------------------------------------------------------------------------%
%-----------------------------------------------------------------------------%
:- implementation.
:- import_module list.
:- import_module require.
%-----------------------------------------------------------------------------%
% Consume the header record with a raw reader, turn its fields into field
% descriptors with the caller's closure, and hand back a reader set up for
% the records that follow.
%
init_with_inferred_desc(Stream, ToFieldDescPred, Result, !IO) :-
    % A raw reader is used only to fetch the first record of the stream.
    csv.init_raw_reader(Stream, HeaderReader, !IO),
    stream.get(HeaderReader, HeaderResult, !IO),
    (
        HeaderResult = eof,
        Result = error("stream is at EOF")
    ;
        HeaderResult = error(HeaderError),
        Result = error(stream.error_message(HeaderError))
    ;
        HeaderResult = ok(raw_record(_LineNumber, HeaderFields)),
        % Ask the user-provided closure which descriptor to use for each
        % header field; together they describe the remainder of the stream.
        list.map_foldl(header_field_to_desc(ToFieldDescPred), HeaderFields,
            RecordDesc, !IO),
        % The header has already been read above, hence 'no_header'.
        csv.init_reader(Stream, no_header, RecordDesc, DataReader, !IO),
        Result = ok(DataReader)
    ).
% header_field_to_desc(ToFieldDescPred, RawField, FieldDesc, !IO):
%
% Apply ToFieldDescPred to the value carried by a raw header field.  The
% other two components of the raw_field are ignored.
%
:- pred header_field_to_desc(infer_desc_pred::in(infer_desc_pred),
    raw_field::in, field_desc::out, io::di, io::uo) is det.

header_field_to_desc(ToFieldDescPred, raw_field(HeaderName, _, _), FieldDesc,
        !IO) :-
    call(ToFieldDescPred, HeaderName, FieldDesc, !IO).
%-----------------------------------------------------------------------------%
:- end_module infer_desc_from_header.
%-----------------------------------------------------------------------------%
More information about the users
mailing list