Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Revert "Various fixes to byte / bytearray search" #55733

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion base/char.jl
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,6 @@ hash(x::Char, h::UInt) =
hash_uint64(((bitcast(UInt32, x) + UInt64(0xd4d64234)) << 32) ⊻ UInt64(h))

first_utf8_byte(c::Char) = (bitcast(UInt32, c) >> 24) % UInt8
first_utf8_byte(c::AbstractChar) = first_utf8_byte(Char(c)::Char)

# fallbacks:
isless(x::AbstractChar, y::AbstractChar) = isless(Char(x), Char(y))
Expand Down
136 changes: 47 additions & 89 deletions base/strings/search.jl
Original file line number Diff line number Diff line change
Expand Up @@ -10,29 +10,7 @@ match strings with [`match`](@ref).
"""
abstract type AbstractPattern end

# TODO: These unions represent bytes in memory that can be accessed via a pointer.
# this property is used throughout Julia, e.g. also in IO code.
# This deserves a better solution - see #53178.
# If such a better solution comes in place, these unions should be replaced.
const DenseInt8 = Union{
DenseArray{Int8},
FastContiguousSubArray{Int8,N,<:DenseArray} where N
}

# Note: This union is different from that above in that it includes CodeUnits.
# Currently, this is redundant as CodeUnits <: DenseVector, but this subtyping
# is buggy and may be removed in the future, see #54002
const DenseUInt8 = Union{
DenseArray{UInt8},
FastContiguousSubArray{UInt8,N,<:DenseArray} where N,
CodeUnits{UInt8, <:Union{String, SubString{String}}},
FastContiguousSubArray{UInt8,N,<:CodeUnits{UInt8, <:Union{String, SubString{String}}}} where N,
}

const DenseUInt8OrInt8 = Union{DenseUInt8, DenseInt8}

last_byteindex(x::Union{String, SubString{String}}) = ncodeunits(x)
last_byteindex(x::DenseUInt8OrInt8) = lastindex(x)
nothing_sentinel(i) = i == 0 ? nothing : i

function last_utf8_byte(c::Char)
u = reinterpret(UInt32, c)
Expand All @@ -52,11 +30,11 @@ function findnext(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:AbstractChar}
end
@inbounds isvalid(s, i) || string_index_err(s, i)
c = pred.x
c ≤ '\x7f' && return _search(s, first_utf8_byte(c), i)
c ≤ '\x7f' && return nothing_sentinel(_search(s, c % UInt8, i))
while true
i = _search(s, first_utf8_byte(c), i)
i === nothing && return nothing
isvalid(s, i) && pred(s[i]) && return i
i == 0 && return nothing
pred(s[i]) && return i
i = nextind(s, i)
end
end
Expand All @@ -69,41 +47,31 @@ const DenseBytes = Union{
CodeUnits{UInt8, <:Union{String, SubString{String}}},
}

function findfirst(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:Union{UInt8, Int8}}, a::Union{DenseInt8, DenseUInt8})
findnext(pred, a, firstindex(a))
end
const ByteArray = Union{DenseBytes, DenseArrayType{Int8}}

function findnext(pred::Fix2{<:Union{typeof(isequal),typeof(==)},UInt8}, a::DenseUInt8, i::Integer)
_search(a, pred.x, i)
end
findfirst(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:Union{Int8,UInt8}}, a::ByteArray) =
nothing_sentinel(_search(a, pred.x))

function findnext(pred::Fix2{<:Union{typeof(isequal),typeof(==)},Int8}, a::DenseInt8, i::Integer)
_search(a, pred.x, i)
end
findnext(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:Union{Int8,UInt8}}, a::ByteArray, i::Integer) =
nothing_sentinel(_search(a, pred.x, i))

# iszero is special, in that the bitpattern for zero for Int8 and UInt8 is the same,
# so we can use memchr even if we search for an Int8 in an UInt8 array or vice versa
findfirst(::typeof(iszero), a::DenseUInt8OrInt8) = _search(a, zero(UInt8))
findnext(::typeof(iszero), a::DenseUInt8OrInt8, i::Integer) = _search(a, zero(UInt8), i)
findfirst(::typeof(iszero), a::ByteArray) = nothing_sentinel(_search(a, zero(UInt8)))
findnext(::typeof(iszero), a::ByteArray, i::Integer) = nothing_sentinel(_search(a, zero(UInt8), i))

function _search(a::Union{String,SubString{String},DenseUInt8OrInt8}, b::Union{Int8,UInt8}, i::Integer = firstindex(a))
fst = firstindex(a)
lst = last_byteindex(a)
if i < fst
function _search(a::Union{String,SubString{String},<:ByteArray}, b::Union{Int8,UInt8}, i::Integer = 1)
if i < 1
throw(BoundsError(a, i))
end
n_bytes = lst - i + 1
if i > lst
return i == lst+1 ? nothing : throw(BoundsError(a, i))
n = sizeof(a)
if i > n
return i == n+1 ? 0 : throw(BoundsError(a, i))
end
GC.@preserve a begin
p = pointer(a)
q = ccall(:memchr, Ptr{UInt8}, (Ptr{UInt8}, Int32, Csize_t), p+i-fst, b, n_bytes)
end
return q == C_NULL ? nothing : (q-p+fst) % Int
p = pointer(a)
q = GC.@preserve a ccall(:memchr, Ptr{UInt8}, (Ptr{UInt8}, Int32, Csize_t), p+i-1, b, n-i+1)
return q == C_NULL ? 0 : Int(q-p+1)
end

function _search(a::DenseUInt8, b::AbstractChar, i::Integer = firstindex(a))
function _search(a::ByteArray, b::AbstractChar, i::Integer = 1)
if isascii(b)
_search(a,UInt8(b),i)
else
Expand All @@ -112,51 +80,41 @@ function _search(a::DenseUInt8, b::AbstractChar, i::Integer = firstindex(a))
end

function findprev(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:AbstractChar},
s::Union{String, SubString{String}}, i::Integer)
s::String, i::Integer)
c = pred.x
c ≤ '\x7f' && return _rsearch(s, first_utf8_byte(c), i)
c ≤ '\x7f' && return nothing_sentinel(_rsearch(s, c % UInt8, i))
b = first_utf8_byte(c)
while true
i = _rsearch(s, b, i)
i == nothing && return nothing
isvalid(s, i) && pred(s[i]) && return i
i == 0 && return nothing
pred(s[i]) && return i
i = prevind(s, i)
end
end

function findlast(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:Union{Int8,UInt8}}, a::DenseUInt8OrInt8)
findprev(pred, a, lastindex(a))
end
findlast(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:Union{Int8,UInt8}}, a::ByteArray) =
nothing_sentinel(_rsearch(a, pred.x))

function findprev(pred::Fix2{<:Union{typeof(isequal),typeof(==)},Int8}, a::DenseInt8, i::Integer)
_rsearch(a, pred.x, i)
end
findprev(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:Union{Int8,UInt8}}, a::ByteArray, i::Integer) =
nothing_sentinel(_rsearch(a, pred.x, i))

function findprev(pred::Fix2{<:Union{typeof(isequal),typeof(==)},UInt8}, a::DenseUInt8, i::Integer)
_rsearch(a, pred.x, i)
end
findlast(::typeof(iszero), a::ByteArray) = nothing_sentinel(_rsearch(a, zero(UInt8)))
findprev(::typeof(iszero), a::ByteArray, i::Integer) = nothing_sentinel(_rsearch(a, zero(UInt8), i))

# See comments above for findfirst(::typeof(iszero)) methods
findlast(::typeof(iszero), a::DenseUInt8OrInt8) = _rsearch(a, zero(UInt8))
findprev(::typeof(iszero), a::DenseUInt8OrInt8, i::Integer) = _rsearch(a, zero(UInt8), i)

function _rsearch(a::Union{String,SubString{String},DenseUInt8OrInt8}, b::Union{Int8,UInt8}, i::Integer = last_byteindex(a))
fst = firstindex(a)
lst = last_byteindex(a)
if i < fst
return i == fst - 1 ? nothing : throw(BoundsError(a, i))
end
if i > lst
return i == lst+1 ? nothing : throw(BoundsError(a, i))
function _rsearch(a::Union{String,ByteArray}, b::Union{Int8,UInt8}, i::Integer = sizeof(a))
if i < 1
return i == 0 ? 0 : throw(BoundsError(a, i))
end
GC.@preserve a begin
p = pointer(a)
q = ccall(:memrchr, Ptr{UInt8}, (Ptr{UInt8}, Int32, Csize_t), p, b, i-fst+1)
n = sizeof(a)
if i > n
return i == n+1 ? 0 : throw(BoundsError(a, i))
end
return q == C_NULL ? nothing : (q-p+fst) % Int
p = pointer(a)
q = GC.@preserve a ccall(:memrchr, Ptr{UInt8}, (Ptr{UInt8}, Int32, Csize_t), p, b, i)
return q == C_NULL ? 0 : Int(q-p+1)
end

function _rsearch(a::DenseUInt8, b::AbstractChar, i::Integer = length(a))
function _rsearch(a::ByteArray, b::AbstractChar, i::Integer = length(a))
if isascii(b)
_rsearch(a,UInt8(b),i)
else
Expand Down Expand Up @@ -266,19 +224,18 @@ end

in(c::AbstractChar, s::AbstractString) = (findfirst(isequal(c),s)!==nothing)

function _searchindex(s::Union{AbstractString,DenseUInt8OrInt8},
function _searchindex(s::Union{AbstractString,ByteArray},
t::Union{AbstractString,AbstractChar,Int8,UInt8},
i::Integer)
sentinel = firstindex(s) - 1
x = Iterators.peel(t)
if isnothing(x)
return firstindex(s) <= i <= nextind(s,lastindex(s))::Int ? i :
return 1 <= i <= nextind(s,lastindex(s))::Int ? i :
throw(BoundsError(s, i))
end
t1, trest = x
while true
i = findnext(isequal(t1),s,i)
if i === nothing return sentinel end
if i === nothing return 0 end
ii = nextind(s, i)::Int
a = Iterators.Stateful(trest)
matched = all(splat(==), zip(SubString(s, ii), a))
Expand Down Expand Up @@ -552,8 +509,9 @@ julia> findall(UInt8[1,2], UInt8[1,2,3,1,2])
!!! compat "Julia 1.3"
This method requires at least Julia 1.3.
"""
function findall(t::Union{AbstractString, AbstractPattern, AbstractVector{UInt8}},
s::Union{AbstractString, AbstractPattern, AbstractVector{UInt8}},

function findall(t::Union{AbstractString, AbstractPattern, AbstractVector{<:Union{Int8,UInt8}}},
s::Union{AbstractString, AbstractPattern, AbstractVector{<:Union{Int8,UInt8}}},
; overlap::Bool=false)
found = UnitRange{Int}[]
i, e = firstindex(s), lastindex(s)
Expand Down Expand Up @@ -606,7 +564,7 @@ function _rsearchindex(s::AbstractString,
end
end

function _rsearchindex(s::Union{String, SubString{String}}, t::Union{String, SubString{String}}, i::Integer)
function _rsearchindex(s::String, t::String, i::Integer)
# Check for fast case of a single byte
if lastindex(t) == 1
return something(findprev(isequal(t[1]), s, i), 0)
Expand Down
49 changes: 0 additions & 49 deletions test/strings/search.jl
Original file line number Diff line number Diff line change
Expand Up @@ -155,16 +155,6 @@ for str in [u8str]
@test findprev(isequal('ε'), str, 4) === nothing
end

# See the comments in #54579
@testset "Search for invalid chars" begin
@test findfirst(==('\xff'), "abc\xffde") == 4
@test findprev(isequal('\xa6'), "abc\xa69", 5) == 4
@test isnothing(findfirst(==('\xff'), "abcdeæd"))

@test isnothing(findnext(==('\xa6'), "æ", 1))
@test isnothing(findprev(==('\xa6'), "æa", 2))
end

# string forward search with a single-char string
@test findfirst("x", astr) === nothing
@test findfirst("H", astr) == 1:1
Expand Down Expand Up @@ -455,45 +445,6 @@ end
@test_throws BoundsError findprev(pattern, A, -3)
end
end

@test findall([0x01, 0x02], [0x03, 0x01, 0x02, 0x01, 0x02, 0x06]) == [2:3, 4:5]
@test isempty(findall([0x04, 0x05], [0x03, 0x04, 0x06]))
end

# Issue 54578
@testset "No conflation of Int8 and UInt8" begin
# Work for mixed types if the values are the same
@test findfirst(==(Int8(1)), [0x01]) == 1
@test findnext(iszero, Int8[0, -2, 0, -3], 2) == 3
@test findfirst(Int8[1,4], UInt8[0, 2, 4, 1, 8, 1, 4, 2]) == 6:7
@test findprev(UInt8[5, 6], Int8[1, 9, 2, 5, 6, 3], 6) == 4:5

# Returns nothing for the same methods if the values are different,
# even if the bitpatterns are the same
@test isnothing(findfirst(==(Int8(-1)), [0xff]))
@test isnothing(findnext(isequal(0xff), Int8[-1, -2, -1], 2))
@test isnothing(findfirst(UInt8[0xff, 0xfe], Int8[0, -1, -2, 1, 8, 1, 4, 2]))
@test isnothing(findprev(UInt8[0xff, 0xfe], Int8[1, 9, 2, -1, -2, 3], 6))
end

@testset "DenseArray with offsets" begin
isdefined(Main, :OffsetDenseArrays) || @eval Main include("../testhelpers/OffsetDenseArrays.jl")
OffsetDenseArrays = Main.OffsetDenseArrays

A = OffsetDenseArrays.OffsetDenseArray(collect(0x61:0x69), 100)
@test findfirst(==(0x61), A) == 101
@test findlast(==(0x61), A) == 101
@test findfirst(==(0x00), A) === nothing

@test findfirst([0x62, 0x63, 0x64], A) == 102:104
@test findlast([0x63, 0x64], A) == 103:104
@test findall([0x62, 0x63], A) == [102:103]

@test findfirst(iszero, A) === nothing
A = OffsetDenseArrays.OffsetDenseArray([0x01, 0x02, 0x00, 0x03], -100)
@test findfirst(iszero, A) == -97
@test findnext(==(0x02), A, -99) == -98
@test findnext(==(0x02), A, -97) === nothing
end

# issue 32568
Expand Down
31 changes: 0 additions & 31 deletions test/testhelpers/OffsetDenseArrays.jl

This file was deleted.