Skip to content

Commit

Permalink
update utf8
Browse files Browse the repository at this point in the history
add Extend Move feature
SSgumS committed Jul 19, 2022

Verified

This commit was created on GitHub.com and signed with GitHub’s verified signature.
1 parent a5b0a93 commit ebdfd5f
Showing 13 changed files with 317 additions and 155 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -22,6 +22,8 @@ Mentioned directories are at the locations below:
# Scripts
## PakNevis
Correct common mistakes in Persian text.
## Extend Move
Extend \move based on the line's time (created for linear signs that go outside of the video boundaries).
## Unretard
Converts non-RTL typed text to RTL compatible one.
## RTL / RTL

Large diffs are not rendered by default.

31 changes: 31 additions & 0 deletions include/AL/utf8/README.md
Original file line number Diff line number Diff line change
@@ -36,10 +36,15 @@ utf8.raw.gsub(str, "ло+", "보라")

It also provides all functions from the Lua 5.3 UTF-8 [module](https://www.lua.org/manual/5.3/manual.html#6.5) except `utf8.len (s [, i [, j]])`. If you need to validate your strings, use `utf8.validate(str, byte_pos)` or iterate over them with `utf8.validator`.

Please note that the library assumes regexes are valid UTF-8 strings. If you need to manipulate individual bytes, use the vanilla functions under `utf8.raw`.


#### Installation:

Download the repository to your project folder (no rockspecs yet).

Examples assume the library is placed under a `utf8` subfolder, not `utf8.lua`.

As of Lua 5.3, the default `utf8` module takes precedence over a user-provided one. In this case you can specify the full module path (`.utf8`).

#### Configuration:
@@ -57,6 +62,32 @@ utf8.config = {
}
utf8:init()
```

For the `lower` and `upper` functions to work in environments where `ffi` cannot be used, you can specify substitution tables ([data example](https://github.com/artemshein/luv/blob/master/utf8data.lua)):

```Lua
local utf8 = require('.utf8')
utf8.config = {
conversion = {
uc_lc = utf8_uc_lc,
lc_uc = utf8_lc_uc
},
}
utf8:init()
```
Customization is done before initialization. If you want, you can change the configuration after `init`; this might work for everything except modules, all of which should be reloaded.

#### [Documentation:](test/test.lua)

#### Issue reporting:

Please provide an example script that causes the error, together with an environment description and debug output. Debug output can be obtained like this:
```Lua
local utf8 = require('.utf8')
utf8.config = {
debug = utf8:require("util").debug
}
utf8:init()
-- your code
```
The default logger is [`io.write`](https://www.lua.org/manual/5.3/manual.html#pdf-io.write); it can be changed by specifying `logger = my_logger` in the configuration.
1 change: 1 addition & 0 deletions include/AL/utf8/functions/lua53.lua
Original file line number Diff line number Diff line change
@@ -5,6 +5,7 @@ local utf8gensub = utf8.gensub
local unpack = utf8.config.unpack
local generate_matcher_function = utf8:require 'regex_parser'

local
function get_matcher_function(regex, plain)
local res
if utf8.config.cache then
6 changes: 5 additions & 1 deletion include/AL/utf8/init.lua
Original file line number Diff line number Diff line change
@@ -25,7 +25,11 @@ local utf8 = {
else
return {}
end
end
end,
conversion = {
uc_lc = nil,
lc_uc = nil
}
},
regex = {
compiletime = {
36 changes: 30 additions & 6 deletions include/AL/utf8/primitives/dummy.lua
Original file line number Diff line number Diff line change
@@ -12,11 +12,6 @@
-- * utf8gmatch(str, regex, all)
-- * utf8gsub(str, regex, repl, limit)
--
-- If utf8data.lua (containing the lower<->upper case mappings) is loaded, these
-- additional functions are available:
-- * utf8upper(s)
-- * utf8lower(s)
--
-- All functions behave as their non UTF-8 aware counterparts with the exception
-- that UTF-8 characters are used instead of bytes for all units.

@@ -76,6 +71,8 @@ local rep = string.rep
local sub = string.sub
local upper = string.upper

local utf8charpattern = '[%z\1-\127\194-\244][\128-\191]*'

-- Returns how many bytes the UTF-8 sequence introduced by `byte` occupies.
--   * a missing byte (nil/false) gives 0
--   * bytes below 0x80 give 1
--   * bytes from 0xC0 / 0xE0 / 0xF0 upwards give 2 / 3 / 4
--   * anything else (0x80-0xBF) falls through to 1
local function utf8symbollen(byte)
  if not byte then
    return 0
  elseif byte < 0x80 then
    return 1
  elseif byte >= 0xF0 then
    return 4
  elseif byte >= 0xE0 then
    return 3
  elseif byte >= 0xC0 then
    return 2
  end
  return 1
end
@@ -494,6 +491,33 @@ local function utf8offset(str, n, bs)

end

-- Substitutes the characters of `s` using the lookup table `mapping`.
-- Each match of `utf8charpattern` is passed to `utf8.raw.gsub` with `mapping`
-- as the replacement table.
-- @param s        string to transform
-- @param mapping  table consulted for every matched character
-- @return the transformed string only (the substitution count is discarded)
-- Raises an error when `s` is not a string or `mapping` is not a table.
local function utf8replace (s, mapping)
  local s_type, mapping_type = type(s), type(mapping)
  if s_type ~= "string" then
    error("bad argument #1 to 'utf8replace' (string expected, got ".. s_type.. ")")
  end
  if mapping_type ~= "table" then
    error("bad argument #2 to 'utf8replace' (table expected, got ".. mapping_type.. ")")
  end
  -- The extra parentheses truncate gsub's results to the first value.
  return (utf8.raw.gsub(s, utf8charpattern, mapping))
end

-- Upper-cases `s` through the user-supplied lower->upper table
-- (`utf8.config.conversion.lc_uc`), which is read at call time.
local function utf8upper (s)
  local lc_uc = utf8.config.conversion.lc_uc
  return utf8replace(s, lc_uc)
end

-- Rebind the file-local `upper` only when a conversion table is configured;
-- otherwise it keeps its existing value.
if utf8.config.conversion.lc_uc then
  upper = utf8upper
end

-- Lower-cases `s` through the user-supplied upper->lower table
-- (`utf8.config.conversion.uc_lc`), which is read at call time.
local function utf8lower (s)
  local uc_lc = utf8.config.conversion.uc_lc
  return utf8replace(s, uc_lc)
end

-- Rebind `lower` only when a conversion table is configured; otherwise it
-- keeps its existing value.
if utf8.config.conversion.uc_lc then
  lower = utf8lower
end

utf8.len = utf8len
utf8.sub = utf8sub
utf8.reverse = utf8reverse
@@ -514,7 +538,7 @@ for k,v in pairs(string) do
utf8.raw[k] = v
end

utf8.charpattern = '[\0-\127\194-\244][\128-\191]*'
utf8.charpattern = utf8charpattern
utf8.offset = utf8offset
if _VERSION == 'Lua 5.3' then
local utf8_53 = require "utf8"
84 changes: 44 additions & 40 deletions include/AL/utf8/primitives/native.lua
Original file line number Diff line number Diff line change
@@ -1,52 +1,56 @@
return function(utf8)

local ffi = require("ffi")
if ffi.os == "Windows" then
os.setlocale(utf8.config.locale or "english_us.65001", "ctype")
ffi.cdef[[
short towupper(short c);
short towlower(short c);
]]
else
os.setlocale(utf8.config.locale or "C.UTF-8", "ctype")
ffi.cdef[[
int towupper(int c);
int towlower(int c);
]]
end
local ffi = require("ffi")
if ffi.os == "Windows" then
os.setlocale(utf8.config.locale or "english_us.65001", "ctype")
ffi.cdef[[
short towupper(short c);
short towlower(short c);
]]
else
os.setlocale(utf8.config.locale or "C.UTF-8", "ctype")
ffi.cdef[[
int towupper(int c);
int towlower(int c);
]]
end

utf8:require "primitives.dummy"

function utf8.lower(str)
local bs = 1
local nbs
local bytes = utf8.raw.len(str)
local res = {}

while bs <= bytes do
nbs = utf8.next(str, bs)
local cp = utf8.unicode(str, bs, nbs)
res[#res + 1] = ffi.C.towlower(cp)
bs = nbs
if not utf8.config.conversion.uc_lc then
function utf8.lower(str)
local bs = 1
local nbs
local bytes = utf8.raw.len(str)
local res = {}

while bs <= bytes do
nbs = utf8.next(str, bs)
local cp = utf8.unicode(str, bs, nbs)
res[#res + 1] = ffi.C.towlower(cp)
bs = nbs
end

return utf8.char(utf8.config.unpack(res))
end

return utf8.char(utf8.config.unpack(res))
end

function utf8.upper(str)
local bs = 1
local nbs
local bytes = utf8.raw.len(str)
local res = {}

while bs <= bytes do
nbs = utf8.next(str, bs)
local cp = utf8.unicode(str, bs, nbs)
res[#res + 1] = ffi.C.towupper(cp)
bs = nbs
if not utf8.config.conversion.lc_uc then
function utf8.upper(str)
local bs = 1
local nbs
local bytes = utf8.raw.len(str)
local res = {}

while bs <= bytes do
nbs = utf8.next(str, bs)
local cp = utf8.unicode(str, bs, nbs)
res[#res + 1] = ffi.C.towupper(cp)
bs = nbs
end

return utf8.char(utf8.config.unpack(res))
end

return utf8.char(utf8.config.unpack(res))
end

return utf8
3 changes: 2 additions & 1 deletion include/AL/utf8/test.sh
Original file line number Diff line number Diff line change
@@ -12,7 +12,8 @@ for test in \
test/context_runtime.lua \
test/test.lua \
test/test_compat.lua \
test/test_pm.lua
test/test_pm.lua \
test/test_utf8data.lua
do
$lua53 $test
$lua51 $test
3 changes: 2 additions & 1 deletion include/AL/utf8/test/charclass_compiletime.lua
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
local utf8 = require "init"
utf8.config = {
debug = nil, --utf8:require("util").debug
debug = nil,
-- debug = utf8:require("util").debug,
}
utf8:init()

4 changes: 3 additions & 1 deletion include/AL/utf8/test/test.lua
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
local utf8 = require('init')
utf8.config = {
debug = nil, --utf8:require("util").debug
debug = nil,
-- debug = utf8:require("util").debug,
}
utf8:init()

for k,v in pairs(utf8) do
string[k] = v
end
4 changes: 2 additions & 2 deletions include/AL/utf8/test/test_compat.lua
Original file line number Diff line number Diff line change
@@ -33,7 +33,7 @@ assert(utf8.sub("\000123456789", 8) == "789")
print('+')

assert(utf8.find("123456789", "345") == 3)
a,b = utf8.find("123456789", "345")
local a,b = utf8.find("123456789", "345")
assert(utf8.sub("123456789", a, b) == "345")
assert(utf8.find("1234567890123456789", "345", 3) == 3)
assert(utf8.find("1234567890123456789", "345", 4) == 13)
@@ -102,7 +102,7 @@ print('+')
do
local f = utf8.gmatch("1 2 3 4 5", "%d+")
assert(f() == "1")
co = coroutine.wrap(f)
local co = coroutine.wrap(f)
assert(co() == "2")
end

13 changes: 10 additions & 3 deletions include/AL/utf8/test/test_pm.lua
Original file line number Diff line number Diff line change
@@ -30,11 +30,13 @@ utf8:init()

print('testing pattern matching')

local
function f(s, p)
local i,e = utf8.find(s, p)
if i then return utf8.sub(s, i, e) end
end

local
function f1(s, p)
p = utf8.gsub(p, "%%([0-9])", function (s) return "%" .. (tonumber(s)+1) end)
p = utf8.gsub(p, "^(^?)", "%1()", 1)
@@ -43,6 +45,7 @@ function f1(s, p)
return utf8.sub(s, t[1], t[#t] - 1)
end

local
a,b = utf8.find('', '') -- empty patterns are tricky
assert(a == 1 and b == 0);
a,b = utf8.find('alo', '')
@@ -132,12 +135,15 @@ local abc = utf8.char(range(0, 255));
assert(utf8.len(abc) == 256)
assert(string.len(abc) == 384)

local
function strset (p)
local res = {s=''}
utf8.gsub(abc, p, function (c) res.s = res.s .. c end)
return res.s
end;

local a, b, c, d, e, t

-- local E = utf8.escape
-- assert(utf8.len(strset(E'[%200-%210]')) == 11)

@@ -157,7 +163,7 @@ assert(utf8.match("254 K", "(%d*)K") == "")
assert(utf8.match("alo ", "(%w*)$") == "")
assert(utf8.match("alo ", "(%w+)$") == nil)
assert(utf8.find("(álo)", "%(á") == 1)
local a, b, c, d, e = utf8.match("âlo alo", "^(((.).).* (%w*))$")
a, b, c, d, e = utf8.match("âlo alo", "^(((.).).* (%w*))$")
assert(a == 'âlo alo' and b == 'âl' and c == 'â' and d == 'alo' and e == nil)
a, b, c, d = utf8.match('0123456789', '(.+(.?)())')
assert(a == '0123456789' and b == '' and c == 11 and d == nil)
@@ -203,6 +209,7 @@ x = utf8.gsub("$local utf8=require'init' x=utf8.gsub('alo', '.', utf8.upper)$ as
"$([^$]*)%$", dostring)
assert(x == ' assim vai para ALO')

local s,r
t = {}
s = 'a alo jose joao'
r = utf8.gsub(s, '()(%w+)()', function (a,w,b)
@@ -211,7 +218,7 @@ r = utf8.gsub(s, '()(%w+)()', function (a,w,b)
end)
assert(s == r and t[1] == 1 and t[3] == 3 and t[7] == 4 and t[13] == 4)


local
function isbalanced (s)
return utf8.find(utf8.gsub(s, "%b()", ""), "[()]") == nil
end
@@ -273,7 +280,7 @@ Stepets: ignoring this test because it's probably bug in Lua.
-- end

-- recursive nest of gsubs
function rev (s)
local function rev (s)
return utf8.gsub(s, "(.)(.+)", function (c,s1) return rev(s1)..c end)
end

15 changes: 15 additions & 0 deletions include/AL/utf8/test/test_utf8data.lua
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
-- Exercises lower/upper when substitution tables are supplied through
-- `config.conversion`.
local utf8uclc = require('init')

-- Builds a stub table whose every lookup yields `value`, so any character
-- passed through it is replaced by that value.
local function constant_mapping(value)
  return setmetatable({}, {__index = function() return value end})
end

utf8uclc.config = {
  debug = nil,
  -- debug = utf8uclc:require("util").debug,
  conversion = {
    uc_lc = constant_mapping("l"),
    lc_uc = constant_mapping("u"),
  }
}
utf8uclc:init()

local assert_equals = require 'test.util'.assert_equals

-- "фыва" is four characters; each one should be replaced by the stub value.
assert_equals(utf8uclc.lower("фыва"), "llll")
assert_equals(utf8uclc.upper("фыва"), "uuuu")

0 comments on commit ebdfd5f

Please sign in to comment.