Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add optimised findall(isequal(::Char), ::String) #54593

Merged
merged 5 commits into from
Jun 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 39 additions & 0 deletions base/strings/search.jl
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,16 @@ abstract type AbstractPattern end

# Map the "not found" marker (anything comparing equal to 0) to `nothing`;
# every other value is passed through untouched.
function nothing_sentinel(i)
    i == 0 && return nothing
    return i
end

# Return, as a `UInt8`, the final code unit of `c`.
# The `UInt32` obtained by reinterpreting `c` is shifted right by eight bits for
# every code unit short of four, which brings that code unit into the low byte.
# Masking with 31 keeps the shift amount inside 0:31.
function last_utf8_byte(c::Char)
    unused_bytes = 4 - ncodeunits(c)
    bits = (unused_bytes * 8) & 31
    return (reinterpret(UInt32, c) >> bits) % UInt8
end

# Report whether byte `x` lies outside the range 0x80:0xf7.
# Such a byte is guaranteed to be the only byte in a Char, and this
# holds even in the presence of invalid UTF8.
function is_standalone_byte(x::UInt8)
    return !(0x80 <= x <= 0xf7)
end

function findnext(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:AbstractChar},
s::Union{String, SubString{String}}, i::Integer)
if i < 1 || i > sizeof(s)
Expand Down Expand Up @@ -102,6 +112,35 @@ function _rsearch(a::ByteArray, b::AbstractChar, i::Integer = length(a))
end
end

# Collect every index of `s` at which iterating the string yields a character
# satisfying `pred`, where `pred` is `==(c)` or `isequal(c)` for a character `c`.
# The string is scanned for the last byte of `c`; each byte hit is then turned
# back into a candidate start index and verified against the actual character.
function findall(
    pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:AbstractChar},
    s::Union{String, SubString{String}}
)
    needle = Char(pred.x)::Char
    tail = last_utf8_byte(needle)
    width = ncodeunits(needle)

    # A standalone byte can never be part of another Char, so a plain byte
    # search over the code units (memchr) already gives the answer.
    if is_standalone_byte(tail)
        return findall(==(tail), codeunits(s))
    end

    hits = Int[]
    pos = _search(s, tail, firstindex(s))
    while !iszero(pos)
        # `pos` is where the last byte was seen; step back to where the
        # character would have to begin.
        start = pos + 1 - width
        # For an invalid needle the computed start may fall inside another
        # char, where reading a character would throw. Only valid indices
        # are inspected.
        if isvalid(s, start)
            # `iterate` is used rather than indexing because indexing would
            # wastefully re-check index validity. Something like
            # try_getindex(::String, ::Int) would be a better fit here.
            candidate = first(something(iterate(s, start)))
            pred(candidate) && push!(hits, start)
        end
        pos = _search(s, tail, pos + 1)
    end
    return hits
end

"""
findfirst(pattern::AbstractString, string::AbstractString)
findfirst(pattern::AbstractPattern, string::String)
Expand Down
16 changes: 16 additions & 0 deletions test/strings/search.jl
Original file line number Diff line number Diff line change
Expand Up @@ -395,6 +395,22 @@ s_18109 = "fooα🐨βcd3"
@test findall("aa", "aaaaaa", overlap=true) == [1:2, 2:3, 3:4, 4:5, 5:6]
end

# Tests for findall with an `==(c)` / `isequal(c)` character predicate on strings.
@testset "Findall char in string" begin
    @test findall(==('w'), "wabcwewwawk") == [1, 5, 7, 8, 10]
    # The needle has to be a Char: a String needle such as "w" never equals any
    # Char, so the check would pass vacuously without exercising a Char search.
    @test isempty(findall(isequal('w'), "abcde!,"))
    @test findall(==('读'), "联国读大会一九四二月十读日第号决通过并颁布读") == [7, 34, 64]

    # SubString input: indices are relative to the substring, not its parent
    @test findall(==('w'), SubString("xwabcw", 2)) == [1, 5]

    # Empty string
    @test isempty(findall(isequal('K'), ""))
    @test isempty(findall(isequal('α'), ""))

    # Finds an invalid char ONLY if it's at a char boundary in the string,
    # i.e. iterating the string would emit the given char.
    @test findall(==('\xfe'), "abκæøc\xfeα\xfeβå!") == [10, 13]
    @test isempty(findall(==('\xaf'), "abκæ读α\xe8\xaf\xfeβå!"))
    @test isempty(findall(==('\xc3'), ";æ"))
end

# issue 37280
@testset "UInt8, Int8 vector" begin
for T in [Int8, UInt8], VT in [Int8, UInt8]
Expand Down
Loading