Skip to content

Commit

Permalink
mathml char data
Browse files Browse the repository at this point in the history
  • Loading branch information
michal-h21 committed Jan 6, 2025
1 parent bb03301 commit aec2c33
Show file tree
Hide file tree
Showing 2 changed files with 64 additions and 0 deletions.
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ justinstall: chardef

chardef:
texlua tools/make_chardata.lua > make4ht-char-def.lua
texlua tools/make_mathmlchardata.lua > make4ht-mathml-char-def.lua

version:
echo $(VERSION), $(DATE)
Expand Down
63 changes: 63 additions & 0 deletions tools/make_mathmlchardata.lua
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
-- This file generates a Lua table that maps base Unicode code points to their counterparts in the different math font styles (bold, italic, bold-italic, etc.).
-- The new version of MathML requires using different code points for different font styles,
-- so we need to replace characters in the MathML output depending on the value of the mathvariant attribute.

-- Set the kpathsea program name to "luatex" before doing any file lookup through kpse.
kpse.set_program_name "luatex"
-- Look up UnicodeData.txt through kpathsea; the result is handed to parse_unicode() further down.
-- NOTE(review): presumably nil when the file cannot be found -- confirm against the LuaTeX manual.
local unicode = kpse.find_file("UnicodeData.txt")

--- Reduce a Unicode character name to a lowercase, hyphen-separated style key.
-- The rewrites below are applied strictly in the listed order; each one is a
-- Lua pattern / replacement pair passed to string.gsub.
-- @param chartype character name as found in UnicodeData.txt (a string)
-- @return the rewritten, lowercased string (a single value)
local function get_chartype(chartype)
  local rewrites = {
    { "MATHEMATICAL ", "" }, -- drop the "MATHEMATICAL " marker
    { "SYMBOL$", "" },       -- drop a trailing "SYMBOL"
    { "%a+%s*$", "" },       -- drop the last run of letters plus trailing whitespace
    { "SMALL ", "" },        -- drop the case markers
    { "CAPITAL ", "" },
    { "%s+$", "" },          -- trim whatever whitespace is left at the end
    { "%s+", "-" },          -- join the remaining words with hyphens
  }
  local style = chartype
  for i = 1, #rewrites do
    local rule = rewrites[i]
    -- plain assignment keeps only the first gsub result (the string, not the count)
    style = style:gsub(rule[1], rule[2])
  end
  return style:lower()
end


--- Parse UnicodeData.txt and collect the styled variants of each base character.
-- Only records whose name starts with "MATHEMATICAL" and whose decomposition
-- field (6th field) is non-empty are considered.
-- @param unicode path to UnicodeData.txt
-- @return table keyed by the decimal base code point; each value is a table
--         mapping the style key produced by get_chartype() to the decimal
--         code point of the styled character
-- Raises an error when `unicode` is nil.
local function parse_unicode(unicode)
  -- io.lines(nil) silently iterates over the default input (stdin), so a failed
  -- file lookup would make the script hang instead of reporting the problem.
  if unicode == nil then
    error("parse_unicode: UnicodeData.txt path is nil (file not found?)", 2)
  end
  local unicode_data = {}
  for line in io.lines(unicode) do
    -- fields: code;name;category;combining class;bidi class;decomposition;...
    local code, chartype, basecode = line:match("^(%x+);([^;]+);[^;]+;[^;]+;[^;]+;([^;]+);")
    -- we are interested only in the mathematical symbols
    if code and chartype:match("^MATHEMATICAL") then
      -- the decomposition contains an extra <font> tag; keep only the trailing
      -- hexadecimal number and convert it to decimal
      local basehex = basecode:match("(%x+)$")
      -- guard the match: tonumber(nil, 16) would raise instead of returning nil
      local base = basehex and tonumber(basehex, 16)
      local char = tonumber(code, 16)
      if base and char then
        -- store the styled code point under its base character and style key
        local area = unicode_data[base] or {}
        area[get_chartype(chartype)] = char
        unicode_data[base] = area
      end
    end
  end
  return unicode_data
end

-- Build the mapping from the located UnicodeData.txt and print it to stdout
-- as a Lua source file that returns a table.
local unicode_data = parse_unicode(unicode)

print "-- This file is autogenerated from tools/make_mathmlchardata.lua"
print "return {"

local to_sort = {}
for base, data in pairs(unicode_data) do
  local fields = {}
  for chartype, char in pairs(data) do
    fields[#fields+1] = string.format("['%s']=%s", chartype, char)
  end
  -- pairs() enumerates keys in an unspecified order; sort the fields so the
  -- generated file is byte-identical between runs (the table it describes is unchanged)
  table.sort(fields)
  -- the base code is zero-padded to five digits so the string sort below
  -- orders the entries numerically
  to_sort[#to_sort+1] = string.format("[%05i] = {%s},", base, table.concat(fields, ", "))
end

-- sort characters
table.sort(to_sort)
for _, line in ipairs(to_sort) do print(line) end

print "}"

0 comments on commit aec2c33

Please sign in to comment.