Skip to content

Commit

Permalink
input/csv: improve reliability of text line isolation
Browse files Browse the repository at this point in the history
Slightly unobfuscate the "end of current input chunk" marker in the data
processing loop. Make the variable's identifier reflect that it's not a
temporary, but instead something worth keeping around until needed again.

Unbreak the calculation of line numbers in those situations where input
chunks (including previously accumulated unprocessed data) happen to
start with a line termination. This covers input files which start with
empty lines, as well as environments with multi-byte line termination
sequences (CR/LF) and arbitrary distribution of bytes across chunks.

This fixes bug #968.

Accept when there is no line termination in the current input chunk. We
cannot assume that calling applications always provide file content in
large enough chunks to span complete lines. And any arbitrary chunk size
which applications happen to use can get exceeded by input files (e.g.
for generated files with wide data or long comments).
  • Loading branch information
gsigh committed Dec 21, 2019
1 parent cb3b805 commit fbefa03
Showing 1 changed file with 11 additions and 9 deletions.
20 changes: 11 additions & 9 deletions src/input/csv.c
Original file line number Diff line number Diff line change
Expand Up @@ -1502,7 +1502,8 @@ static int process_buffer(struct sr_input *in, gboolean is_eof)
const struct column_details *details;
col_parse_cb parse_func;
int ret;
char *p, **lines, *line, **columns, *column;
char *processed_up_to;
char **lines, *line, **columns, *column;

inc = in->priv;
if (!inc->started) {
Expand All @@ -1526,16 +1527,17 @@ static int process_buffer(struct sr_input *in, gboolean is_eof)
if (!in->buf->len)
return SR_OK;
if (is_eof) {
p = in->buf->str + in->buf->len;
processed_up_to = in->buf->str + in->buf->len;
} else {
p = g_strrstr_len(in->buf->str, in->buf->len, inc->termination);
if (!p)
return SR_ERR;
*p = '\0';
p += strlen(inc->termination);
processed_up_to = g_strrstr_len(in->buf->str, in->buf->len,
inc->termination);
if (!processed_up_to)
return SR_OK;
*processed_up_to = '\0';
processed_up_to += strlen(inc->termination);
}
g_strstrip(in->buf->str);

/* Split input text lines and process their columns. */
ret = SR_OK;
lines = g_strsplit(in->buf->str, inc->termination, 0);
for (line_idx = 0; (line = lines[line_idx]); line_idx++) {
Expand Down Expand Up @@ -1612,7 +1614,7 @@ static int process_buffer(struct sr_input *in, gboolean is_eof)
g_strfreev(columns);
}
g_strfreev(lines);
g_string_erase(in->buf, 0, p - in->buf->str);
g_string_erase(in->buf, 0, processed_up_to - in->buf->str);

return ret;
}
Expand Down

0 comments on commit fbefa03

Please sign in to comment.