Skip to content

Commit

Permalink
Various fixes to searching (squashed JuliaLang#54579)
Browse files Browse the repository at this point in the history
  • Loading branch information
jakobnissen committed Jan 13, 2025
1 parent 3b629f1 commit 1b6c200
Show file tree
Hide file tree
Showing 2 changed files with 117 additions and 19 deletions.
41 changes: 22 additions & 19 deletions base/strings/search.jl
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,7 @@ const DenseUInt8 = Union{

const DenseUInt8OrInt8 = Union{DenseUInt8, DenseInt8}

last_byteindex(x::Union{String, SubString{String}}) = ncodeunits(x)
last_byteindex(x::DenseUInt8OrInt8) = lastindex(x)
nothing_sentinel(i) = i == 0 ? nothing : i

function last_utf8_byte(c::Char)
u = reinterpret(UInt32, c)
Expand All @@ -44,6 +43,9 @@ end
# This holds even in the presence of invalid UTF8
is_standalone_byte(x::UInt8) = (x < 0x80) | (x > 0xf7)

last_byteindex(x::Union{String, SubString{String}}) = ncodeunits(x)
last_byteindex(x::DenseUInt8OrInt8) = lastindex(x)

function findnext(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:AbstractChar},
s::Union{String, SubString{String}}, i::Integer)
if i < 1 || i > sizeof(s)
Expand All @@ -52,10 +54,10 @@ function findnext(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:AbstractChar}
end
@inbounds isvalid(s, i) || string_index_err(s, i)
c = pred.x
c ≤ '\x7f' && return _search(s, first_utf8_byte(c), i)
c ≤ '\x7f' && return nothing_sentinel(_search(s, first_utf8_byte(c), i))
while true
i = _search(s, first_utf8_byte(c), i)
i === nothing && return nothing
i == 0 && return nothing
isvalid(s, i) && pred(s[i]) && return i
i = nextind(s, i)
end
Expand All @@ -66,17 +68,17 @@ function findfirst(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:Union{UInt8,
end

function findnext(pred::Fix2{<:Union{typeof(isequal),typeof(==)},UInt8}, a::DenseUInt8, i::Integer)
_search(a, pred.x, i)
nothing_sentinel(_search(a, pred.x, i))
end

function findnext(pred::Fix2{<:Union{typeof(isequal),typeof(==)},Int8}, a::DenseInt8, i::Integer)
_search(a, pred.x, i)
nothing_sentinel(_search(a, pred.x, i))
end

# iszero is special, in that the bitpattern for zero for Int8 and UInt8 is the same,
# so we can use memchr even if we search for an Int8 in an UInt8 array or vice versa
findfirst(::typeof(iszero), a::DenseUInt8OrInt8) = _search(a, zero(UInt8))
findnext(::typeof(iszero), a::DenseUInt8OrInt8, i::Integer) = _search(a, zero(UInt8), i)
findfirst(::typeof(iszero), a::DenseUInt8OrInt8) = nothing_sentinel(_search(a, zero(UInt8)))
findnext(::typeof(iszero), a::DenseUInt8OrInt8, i::Integer) = nothing_sentinel(_search(a, zero(UInt8), i))

function _search(a::Union{String,SubString{String},DenseUInt8OrInt8}, b::Union{Int8,UInt8}, i::Integer = firstindex(a))
fst = firstindex(a)
Expand All @@ -86,13 +88,13 @@ function _search(a::Union{String,SubString{String},DenseUInt8OrInt8}, b::Union{I
end
n_bytes = lst - i + 1
if i > lst
return i == lst+1 ? nothing : throw(BoundsError(a, i))
return i == lst+1 ? 0 : throw(BoundsError(a, i))
end
GC.@preserve a begin
p = pointer(a)
q = ccall(:memchr, Ptr{UInt8}, (Ptr{UInt8}, Int32, Csize_t), p+i-fst, b, n_bytes)
end
return q == C_NULL ? nothing : (q-p+fst) % Int
return q == C_NULL ? 0 : (q-p+fst) % Int
end

function _search(a::DenseUInt8, b::AbstractChar, i::Integer = firstindex(a))
Expand All @@ -106,11 +108,11 @@ end
function findprev(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:AbstractChar},
s::Union{String, SubString{String}}, i::Integer)
c = pred.x
c ≤ '\x7f' && return _rsearch(s, first_utf8_byte(c), i)
c ≤ '\x7f' && return nothing_sentinel(_rsearch(s, first_utf8_byte(c), i))
b = first_utf8_byte(c)
while true
i = _rsearch(s, b, i)
i == nothing && return nothing
i == 0 && return nothing
isvalid(s, i) && pred(s[i]) && return i
i = prevind(s, i)
end
Expand All @@ -121,31 +123,32 @@ function findlast(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:Union{Int8,UI
end

function findprev(pred::Fix2{<:Union{typeof(isequal),typeof(==)},Int8}, a::DenseInt8, i::Integer)
_rsearch(a, pred.x, i)
nothing_sentinel(_rsearch(a, pred.x, i))
end

function findprev(pred::Fix2{<:Union{typeof(isequal),typeof(==)},UInt8}, a::DenseUInt8, i::Integer)
_rsearch(a, pred.x, i)
nothing_sentinel(_rsearch(a, pred.x, i))
end

# See comments above for findfirst(::typeof(iszero)) methods
findlast(::typeof(iszero), a::DenseUInt8OrInt8) = _rsearch(a, zero(UInt8))
findprev(::typeof(iszero), a::DenseUInt8OrInt8, i::Integer) = _rsearch(a, zero(UInt8), i)
findlast(::typeof(iszero), a::DenseUInt8OrInt8) = nothing_sentinel(_rsearch(a, zero(UInt8)))
findprev(::typeof(iszero), a::DenseUInt8OrInt8, i::Integer) = nothing_sentinel(_rsearch(a, zero(UInt8), i))

function _rsearch(a::Union{String,SubString{String},DenseUInt8OrInt8}, b::Union{Int8,UInt8}, i::Integer = last_byteindex(a))
fst = firstindex(a)
lst = last_byteindex(a)
if i < fst
return i == fst - 1 ? nothing : throw(BoundsError(a, i))
return i == 0 ? 0 : throw(BoundsError(a, i))
end
n_bytes = lst - i + 1
if i > lst
return i == lst+1 ? nothing : throw(BoundsError(a, i))
return i == lst+1 ? 0 : throw(BoundsError(a, i))
end
GC.@preserve a begin
p = pointer(a)
q = ccall(:memrchr, Ptr{UInt8}, (Ptr{UInt8}, Int32, Csize_t), p, b, i-fst+1)
end
return q == C_NULL ? nothing : (q-p+fst) % Int
return q == C_NULL ? 0 : (q-p+fst) % Int
end

function _rsearch(a::DenseUInt8, b::AbstractChar, i::Integer = length(a))
Expand Down
95 changes: 95 additions & 0 deletions test/strings/search.jl
Original file line number Diff line number Diff line change
Expand Up @@ -296,6 +296,101 @@ end
@test findnext(",b", "foo,bar,baz", 10) === nothing
@test findfirst("az", "foo,bar,baz") == 10:11
@test findnext("az", "foo,bar,baz", 12) === nothing
# See the comments in #54579
@testset "Search for invalid chars" begin
@test findfirst(==('\xff'), "abc\xffde") == 4
@test findprev(isequal('\xa6'), "abc\xa69", 5) == 4
@test isnothing(findfirst(==('\xff'), "abcdeæd"))

@test isnothing(findnext(==('\xa6'), "æ", 1))
@test isnothing(findprev(==('\xa6'), "æa", 2))
end

# string forward search with a single-char string
@test findfirst("x", astr) === nothing
@test findfirst("H", astr) == 1:1
@test findnext("H", astr, 2) === nothing
@test findfirst("l", astr) == 3:3
@test findnext("l", astr, 4) == 4:4
@test findnext("l", astr, 5) == 11:11
@test findnext("l", astr, 12) === nothing
@test findfirst("\n", astr) == 14:14
@test findnext("\n", astr, 15) === nothing

@test findfirst("z", u8str) === nothing
@test findfirst("∄", u8str) === nothing
@test findfirst("∀", u8str) == 1:1
@test findnext("∀", u8str, 4) === nothing
@test findfirst("∃", u8str) == 13:13
@test findnext("∃", u8str, 16) === nothing
@test findfirst("x", u8str) == 26:26
@test findnext("x", u8str, 27) == 43:43
@test findnext("x", u8str, 44) === nothing
@test findfirst("ε", u8str) == 5:5
@test findnext("ε", u8str, 7) == 54:54
@test findnext("ε", u8str, 56) === nothing

# string backward search with a single-char string
@test findlast("x", astr) === nothing
@test findlast("H", astr) == 1:1
@test findprev("H", astr, 2) == 1:1
@test findprev("H", astr, 0) === nothing
@test findlast("l", astr) == 11:11
@test findprev("l", astr, 10) == 4:4
@test findprev("l", astr, 4) == 4:4
@test findprev("l", astr, 3) == 3:3
@test findprev("l", astr, 2) === nothing
@test findlast("\n", astr) == 14:14
@test findprev("\n", astr, 13) === nothing

@test findlast("z", u8str) === nothing
@test findlast("∄", u8str) === nothing
@test findlast("∀", u8str) == 1:1
@test findprev("∀", u8str, 0) === nothing
#TODO: setting the limit in the middle of a wide char
# makes findnext fail but findprev succeed.
# Should findprev fail as well?
#@test findprev("∀", u8str, 2) === nothing # gives 1:3
@test findlast("∃", u8str) == 13:13
@test findprev("∃", u8str, 12) === nothing
@test findlast("x", u8str) == 43:43
@test findprev("x", u8str, 42) == 26:26
@test findprev("x", u8str, 25) === nothing
@test findlast("ε", u8str) == 54:54
@test findprev("ε", u8str, 53) == 5:5
@test findprev("ε", u8str, 4) === nothing

# string forward search with a single-char regex
@test findfirst(r"x", astr) === nothing
@test findfirst(r"H", astr) == 1:1
@test findnext(r"H", astr, 2) === nothing
@test findfirst(r"l", astr) == 3:3
@test findnext(r"l", astr, 4) == 4:4
@test findnext(r"l", astr, 5) == 11:11
@test findnext(r"l", astr, 12) === nothing
@test findfirst(r"\n", astr) == 14:14
@test findnext(r"\n", astr, 15) === nothing
@test findfirst(r"z", u8str) === nothing
@test findfirst(r"∄", u8str) === nothing
@test findfirst(r"∀", u8str) == 1:1
@test findnext(r"∀", u8str, 4) === nothing
@test findfirst(r"∀", u8str) == findfirst(r"\u2200", u8str)
@test findnext(r"∀", u8str, 4) == findnext(r"\u2200", u8str, 4)
@test findfirst(r"∃", u8str) == 13:13
@test findnext(r"∃", u8str, 16) === nothing
@test findfirst(r"x", u8str) == 26:26
@test findnext(r"x", u8str, 27) == 43:43
@test findnext(r"x", u8str, 44) === nothing
@test findfirst(r"ε", u8str) == 5:5
@test findnext(r"ε", u8str, 7) == 54:54
@test findnext(r"ε", u8str, 56) === nothing
for i = 1:lastindex(astr)
@test findnext(r"."s, astr, i) == i:i
end
for i = 1:lastindex(u8str)
if isvalid(u8str,i)
@test findnext(r"."s, u8str, i) == i:i
end
end

@testset "issue #9365" begin
Expand Down

0 comments on commit 1b6c200

Please sign in to comment.