-
-
Notifications
You must be signed in to change notification settings - Fork 5.6k
RFC: Make EOL handling more general #19877
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
81d7c38
a04d81b
fb30c92
80c94b8
e3cfa28
cffdf0d
603baae
7e9cedd
bb7475e
3538aff
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -167,24 +167,37 @@ The text is assumed to be encoded in UTF-8. | |
readuntil(filename::AbstractString, args...) = open(io->readuntil(io, args...), filename) | ||
|
||
""" | ||
readline(stream::IO=STDIN) | ||
readline(filename::AbstractString) | ||
readline(stream::IO=STDIN, chomp::Bool=false) | ||
readline(filename::AbstractString, chomp::Bool=false) | ||
|
||
Read a single line of text including from the given I/O stream or file (defaults to `STDIN`). | ||
Lines in the input can end in `'\\n'`, `'\\r'`, or `'\\r\\n'`. When reading from a file, the text is | ||
assumed to be encoded in UTF-8. If `chomp=false` trailing newline character(s) will be included | ||
in the output (if reached before the end of the input); otherwise newline characters(s) | ||
are stripped from result. | ||
""" | ||
function readline(filename::AbstractString, chomp = false) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Still need to mention |
||
open(filename) do f | ||
readline(f, chomp) | ||
end | ||
end | ||
|
||
Read a single line of text, including a trailing newline character (if one is reached before | ||
the end of the input), from the given I/O stream or file (defaults to `STDIN`). | ||
When reading from a file, the text is assumed to be encoded in UTF-8. | ||
""" | ||
readline(filename::AbstractString) = open(readline, filename) | ||
|
||
""" | ||
readlines(stream::IO) | ||
readlines(filename::AbstractString) | ||
readlines(stream::IO, chomp::Bool=false) | ||
readlines(filename::AbstractString, chomp::Bool=false) | ||
|
||
Read all lines of an I/O stream or a file as a vector of strings. | ||
The text is assumed to be encoded in UTF-8. | ||
""" | ||
readlines(filename::AbstractString) = open(readlines, filename) | ||
|
||
Lines in the input can end in `'\\n'`, `'\\r'`, or `'\\r\\n'`. | ||
The text is assumed to be encoded in UTF-8. If `chomp=false` | ||
trailing newline character(s) will be included in the output; | ||
otherwise newline characters(s) are stripped from result. | ||
""" | ||
function readlines(filename::AbstractString, chomp::Bool=false) | ||
open(filename) do f | ||
readlines(f, chomp) | ||
end | ||
end | ||
|
||
## byte-order mark, ntoh & hton ## | ||
|
||
|
@@ -448,7 +461,29 @@ function readuntil(s::IO, t::AbstractString) | |
end | ||
|
||
readline() = readline(STDIN) | ||
readline(s::IO) = readuntil(s, '\n') | ||
|
||
function readline(s::IO, chomp::Bool=false) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is no longer needed now, right? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's still needed. C version reads from IOStreams, but not for instance IOBuffers. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah, so this means it has to be relatively efficient too. Is that the case? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd say its relatively efficient and anyway it has the same performance as before. |
||
out = UInt8[] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why not use an There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is significantly faster, readuntil uses the same approach. |
||
while !eof(s) | ||
c = read(s, UInt8) | ||
if c == 0x0d | ||
!chomp && push!(out, c) | ||
if !eof(s) && Base.peek(s) == 0x0a | ||
c = read(s, UInt8) | ||
!chomp && push!(out, c) | ||
end | ||
|
||
break | ||
elseif c == 0x0a | ||
!chomp && push!(out, c) | ||
break | ||
else | ||
push!(out, c) | ||
end | ||
end | ||
|
||
return String(out) | ||
end | ||
|
||
""" | ||
readchomp(x) | ||
|
@@ -512,22 +547,28 @@ readstring(filename::AbstractString) = open(readstring, filename) | |
|
||
type EachLine | ||
stream::IO | ||
chomp::Bool | ||
ondone::Function | ||
EachLine(stream) = EachLine(stream, ()->nothing) | ||
EachLine(stream, ondone) = new(stream, ondone) | ||
EachLine(stream, chomp) = EachLine(stream, chomp, ()->nothing) | ||
EachLine(stream, chomp, ondone) = new(stream, chomp, ondone) | ||
end | ||
|
||
""" | ||
eachline(stream::IO) | ||
eachline(filename::AbstractString) | ||
eachline(stream::IO, chomp::Bool=false) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. One space is enough. |
||
eachline(filename::AbstractString, chomp::Bool=false) | ||
|
||
Create an iterable object that will yield each line from an I/O stream or a file. | ||
The text is assumed to be encoded in UTF-8. | ||
Lines in the input can end in `'\\n'`, `'\\r'`, or `'\\r\\n'`. | ||
The text is assumed to be encoded in UTF-8. If `chomp=false` | ||
trailing newline character(s) will be included in the output; | ||
otherwise newline characters(s) are stripped from result. | ||
""" | ||
eachline(stream::IO) = EachLine(stream) | ||
function eachline(filename::AbstractString) | ||
eachline(stream::IO, chomp::Bool=false) = EachLine(stream, chomp) | ||
|
||
|
||
function eachline(filename::AbstractString, chomp::Bool=false) | ||
s = open(filename) | ||
EachLine(s, ()->close(s)) | ||
EachLine(s, chomp, ()->close(s)) | ||
end | ||
|
||
start(itr::EachLine) = nothing | ||
|
@@ -538,10 +579,11 @@ function done(itr::EachLine, nada) | |
itr.ondone() | ||
true | ||
end | ||
next(itr::EachLine, nada) = (readline(itr.stream), nothing) | ||
|
||
next(itr::EachLine, nada) = (readline(itr.stream, itr.chomp), nothing) | ||
eltype(::Type{EachLine}) = String | ||
|
||
readlines(s=STDIN) = collect(eachline(s)) | ||
readlines(s::IO=STDIN, chomp::Bool=false) = collect(eachline(s, chomp)) | ||
|
||
iteratorsize(::Type{EachLine}) = SizeUnknown() | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -77,18 +77,23 @@ chop(s::AbstractString) = SubString(s, 1, endof(s)-1) | |
chomp(s::AbstractString) | ||
|
||
Remove a single trailing newline from a string. | ||
Lines in the input can end in `'\\n'`, `'\\r'`, or `'\\r\\n'`. | ||
""" | ||
function chomp(s::AbstractString) | ||
i = endof(s) | ||
if (i < 1 || s[i] != '\n') return SubString(s, 1, i) end | ||
if (i < 1 || (s[i] != '\n' && s[i] != '\r')) return SubString(s, 1, i) end | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Double space here. |
||
if (s[i] == '\r') return SubString(s, 1, i-1) end | ||
j = prevind(s,i) | ||
if (j < 1 || s[j] != '\r') return SubString(s, 1, i-1) end | ||
if (j < 1 || (s[j] != '\r' && s[i] == '\n')) return SubString(s, 1, i-1) end | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You don't need to check that the |
||
return SubString(s, 1, j-1) | ||
end | ||
|
||
function chomp(s::String) | ||
i = endof(s) | ||
if i < 1 || s.data[i] != 0x0a | ||
if i < 1 || (s.data[i] != 0x0a && s.data[i] != 0x0d) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Since you're making the function more complex, I would find it clearer to replace the integer codes with |
||
SubString(s, 1, i) | ||
elseif s.data[i] == 0x0d | ||
SubString(s, 1, i-1) | ||
elseif i < 2 || s.data[i-1] != 0x0d | ||
SubString(s, 1, i-1) | ||
else | ||
|
@@ -101,6 +106,8 @@ function chomp!(s::String) | |
if !isempty(s) && s.data[end] == 0x0a | ||
n = (endof(s) < 2 || s.data[end-1] != 0x0d) ? 1 : 2 | ||
ccall(:jl_array_del_end, Void, (Any, UInt), s.data, n) | ||
elseif s.data[end] == 0x0d | ||
ccall(:jl_array_del_end, Void, (Any, UInt), s.data, 1) | ||
end | ||
return s | ||
end | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -831,6 +831,66 @@ size_t ios_copyuntil(ios_t *to, ios_t *from, char delim) | |
return total; | ||
} | ||
|
||
// Copy until '\r', '\n' or '\r\n' | ||
size_t ios_copyline(ios_t *to, ios_t *from, int chomp) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Use There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Existing code uses There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. OK, it's just that I found a |
||
{ | ||
size_t nchomp = 0; | ||
size_t total = 0, avail = (size_t)(from->size - from->bpos); | ||
size_t ntowrite; | ||
while (!ios_eof(from)) { | ||
if (avail == 0) { | ||
avail = ios_readprep(from, LINE_CHUNK_SIZE); | ||
if (avail == 0) | ||
break; | ||
} | ||
size_t written; | ||
|
||
char *r = NULL; | ||
char *n = NULL; | ||
|
||
for (size_t i = 0; i < avail; i++){ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Wouldn't it be simpler and possibly more efficient to use There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That's the first thing I tried and it's simpler and significantly slower. You have to look for both This has the same performance as There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I don't follow. What difference does it make from the manual loop? I just suggest breaking on the first match. Then you only have to call
Even, for relatively long strings? I'm asking because it looks like glibc contains hand-written assembly code here and explicit SSE instructions here for this, so I figure there must be a reason. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Actually I tried using two calls to Most of the time in There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. OK. Anyway I've just realized |
||
char *p = (char*)from->buf+from->bpos+i; | ||
char ch = from->buf[from->bpos+i]; | ||
|
||
if (ch == '\n'){ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Space before |
||
n = p; | ||
if (chomp) nchomp = 1; | ||
ntowrite = n - (from->buf+from->bpos) + 1 - nchomp; | ||
break; | ||
} | ||
if (ch == '\r'){ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Missing |
||
r = p; | ||
if (chomp) nchomp = 1; | ||
ntowrite = r - (from->buf+from->bpos) + 1 - nchomp; | ||
if (i < avail){ | ||
char ch2 = from->buf[from->bpos+i+1]; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You can combine the two |
||
if (ch2 == '\n'){ | ||
if (chomp) nchomp = 2; | ||
ntowrite = r - (from->buf+from->bpos) + 2 - nchomp; | ||
} | ||
} | ||
break; | ||
} | ||
} | ||
|
||
if (r == NULL && n == NULL) { | ||
written = ios_write(to, from->buf+from->bpos, avail); | ||
from->bpos += avail; | ||
total += written; | ||
avail = 0; | ||
} | ||
else { | ||
written = ios_write(to, from->buf+from->bpos, ntowrite); | ||
from->bpos += ntowrite + nchomp; | ||
total += written; | ||
return total; | ||
} | ||
} | ||
from->_eof = 1; | ||
return total; | ||
} | ||
|
||
|
||
static void _ios_init(ios_t *s) | ||
{ | ||
// put all fields in a sane initial state | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -74,6 +74,20 @@ Base.compact(io) | |
@test_throws ArgumentError seek(io,0) | ||
@test_throws ArgumentError truncate(io,0) | ||
@test readline(io) == "whipped cream\n" | ||
@test write(io,"pancakes\r\nwaffles\n\rblueberries\r") > 0 | ||
@test readline(io) == "pancakes\r\n" | ||
@test readline(io) == "waffles\n" | ||
@test readline(io) == "\r" | ||
@test readline(io) == "blueberries\r" | ||
write(io,"pancakes\r\nwaffles\n\rblueberries\r") | ||
@test readline(io, true) == "pancakes" | ||
@test readline(io, true) == "waffles" | ||
@test readline(io, true) == "" | ||
@test readline(io, true) == "blueberries" | ||
write(io,"pancakes\r\nwaffles\n\rblueberries\r") | ||
@test readlines(io) == String["pancakes\r\n","waffles\n","\r","blueberries\r"] | ||
write(io,"pancakes\r\nwaffles\n\rblueberries\r") | ||
@test readlines(io, true) == String["pancakes","waffles","","blueberries"] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could you also add tests for potentially problematic cases like |
||
Base.compact(io) | ||
@test position(io) == 0 | ||
@test ioslength(io) == 0 | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Remove "including" (typo?).