From 9209c2919f7660f93e1bab9ddeafbef57431aea5 Mon Sep 17 00:00:00 2001 From: "kwrobel.eth" Date: Thu, 30 Jan 2025 20:07:28 +0100 Subject: [PATCH] more domain specific tokens (#585) --- .../data/domain_specific_dictionary.txt | 110 +++++++++++++++++- .../data/tests/tokenization_quality.json | 79 +++++++++++++ apps/api.nameai.dev/pyproject.toml | 2 +- apps/api.nameai.dev/tests/test_ngrams.py | 21 ++++ apps/api.nameai.dev/tests/test_tokenizer.py | 24 ++++ 5 files changed, 234 insertions(+), 2 deletions(-) create mode 100644 apps/api.nameai.dev/nameai/data/tests/tokenization_quality.json diff --git a/apps/api.nameai.dev/nameai/data/domain_specific_dictionary.txt b/apps/api.nameai.dev/nameai/data/domain_specific_dictionary.txt index 08878cd7..ffa8e3b3 100644 --- a/apps/api.nameai.dev/nameai/data/domain_specific_dictionary.txt +++ b/apps/api.nameai.dev/nameai/data/domain_specific_dictionary.txt @@ -1,4 +1,7 @@ +256 +3dns aah +abebe abhishek acai accel @@ -7,8 +10,10 @@ adria adwords aftermarket agrawal +ai ajay aleks +alex algo alibaba alpinist @@ -22,6 +27,7 @@ anima anonym anonymizer arbitrage +arbitrum aristo athleisure att @@ -40,6 +46,7 @@ badass baddie baileys balaji +balancer balenciaga baller ballers @@ -55,12 +62,16 @@ beasties beaucoup becca bellagio +ben benjamins +binance bisou bisous bitchin bizzare blm +blockchain +blockful bloke blox bnb @@ -95,6 +106,7 @@ capex capitano capos carpool +ccip cdn celeb celebs @@ -109,15 +121,19 @@ christies chronograph chubbies clicquot +clowes cmon coder cognito collab +coltron composability contessa cookware coom +cory courtside +cow cowabunga cowbell crackhead @@ -134,11 +150,15 @@ curveball cybersecurity daddio danke +dao +dapp dapp datuk +davidmihal davinci dawg dawgs +de debeers defcon defi @@ -147,6 +167,7 @@ defund delegator delish deloitte +demarais dennys deployer desantis @@ -155,6 +176,7 @@ dhruv dibs digi digitalization +domico dominicano dongfeng doorn @@ -165,12 +187,22 @@ dutchy ebooking ecommerce econ +efp egirl +eigen eightball elon +ens +ensip +eric +eskender esports esposa +est +ether +ethereum ethers +etherscan etsy evm expat @@ -200,16 +232,21 @@ funghi fungibility fungible fyre +gabrielsen gaijin gangbusters gangnam gangstas +gaspar gelato git +giveth gnar +gnosis godspeed golem grails +greg grifter groovin grump @@ -238,6 +275,7 @@ hotbox hotpot hotties hoverboard +hu huawei hublot hulu @@ -251,6 +289,7 @@ ied illiquid influencer influencers +inoue intl ironworker isation @@ -262,19 +301,26 @@ jamon janky japon jarhead +jeff +jesse jigga jollibee +jordan jordans jpegs jpgs juul kaizen kanpai +karapetsas +karpatkey kawaii kd kek kempinski +kevin khaosan +khori killaz kimpton kingmaker @@ -287,18 +333,26 @@ krazy krunk kundalini kunta +l1 +l2 +labs lacoste ladbrokes lagunitas lambo larp latam +lau leche +lefteris lenovo +leon lfg lgbtq libs +linea lipo +liu lmao lofi loreal @@ -307,22 +361,33 @@ lowlife macallan macys mailroom +mainnet +makeig +makoto mami manana +mann +mask matic maximalist mckenna mcmuffin +mcmxci +mead meanie meetup megatrends +mely mercato merch merica merkle +meta mezcal mgmt mignon +mihal +mike milady millennials minter @@ -332,20 +397,26 @@ mngmt mohd monetization monkfish +moo moonie moonwalker mortage +muhammed munger mystik nakheel +name narco neet negroni nespresso +netto newswire nft nikhil +nimi nobu +node nomics norcal normies @@ -355,13 +426,16 @@ nutz nytimes oclock oconnor +offchain omfg +onchain oundation palooza panera panerai panopticon patek +patricio paulaner payback peachy @@ -372,6 +446,7 @@ perp pesce pfp pharma +phiz pho pics piguet @@ -380,15 +455,18 @@ pimms pimpin playbook playmakers +poap poc podcaster policia +pollak pollo poolside poon porker postgame poutine +prem presale primadonna primero @@ -413,6 +491,7 @@ reiki remax restaurante revolucion +ric rideshare ridgemont rino @@ -422,14 +501,18 @@ rocketeer rocknroll romancer roshan +rotki rsvp salut +sande sanfrancisco saurus sawadee saylor schwab +serenae shawty +she shiller shiznit shizzle @@ -438,11 +521,17 @@ shortie shroom sicilia silverback +simona simp sinha +siwe sixers skillz +skril +skriloff slapper +slobo +slobodnik snakeoil snowboarder snowcat @@ -454,26 +543,34 @@ souljah spacelab spacelabs spacex +spence spicey spliff ssn stacker stakeholder +staking stargaze starlink steph +subnames sugarplum sunil sunnies suntory +super supercool superfood superfoods +swap +syd szabo taha tajmahal takaful +talbert tanqueray +tanrıkulu taormina tarun tastic @@ -506,23 +603,32 @@ twinz ultrafast unbanked univ +unruggable unscripted untethered upenn usdc +v2 validators vamos +van veg +viem vimeo +vitalik +vitalik voxel voxels vroom +wagmi wagyu wahoo wanker wearables +web3 weeb whitelists +whittaker wifey wizz woah @@ -531,6 +637,7 @@ wolfie wonton workspace workwear +worthalter wsj wynwood xchange @@ -548,5 +655,6 @@ yolo zayn zhan zhuan +zk zoomer -zuck \ No newline at end of file +zuck diff --git a/apps/api.nameai.dev/nameai/data/tests/tokenization_quality.json b/apps/api.nameai.dev/nameai/data/tests/tokenization_quality.json new file mode 100644 index 00000000..63d7807d --- /dev/null +++ b/apps/api.nameai.dev/nameai/data/tests/tokenization_quality.json @@ -0,0 +1,79 @@ +{ + "vitalik": ["vitalik"], + "ensv2": ["ens", "v2"], + "ethereum": ["ethereum"], + "alexvandesande": ["alex", "van", "de", "sande"], + "kevingaspar": ["kevin", "gaspar"], + "mikedemarais": ["mike", "demarais"], + "jordanspence": ["jordan", "spence"], + "metamask": ["meta", "mask"], + "web3": ["web3"], + "poap": ["poap"], + "siwe": ["siwe"], + "efp": ["efp"], + "corygabrielsen": ["cory", "gabrielsen"], + "giveth": ["giveth"], + "dappnode": ["dapp", "node"], + "coltron": ["coltron"], + "simona": ["simona"], + "linea": ["linea"], + "nameai": ["name", "ai"], + "premmakeig": ["prem", "makeig"], + "unruggablelabs": ["unruggable", "labs"], + "staking": ["staking"], + "superphiz": ["super", "phiz"], + "gnosisdao": ["gnosis", "dao"], + "balancer": ["balancer"], + "cowswap": ["cow", "swap"], + "she256": ["she", "256"], + "nimi": ["nimi"], + "lefteris": ["lefteris"], + "lefteriskarapetsas": ["lefteris", "karapetsas"], + "rotki": ["rotki"], + "blockchain": ["blockchain"], + "onchain": ["onchain"], + "offchain": ["offchain"], + "davidmihal": ["davidmihal"], + "mihal": ["mihal"], + "jefflau": ["jeff", "lau"], + "enslabs": ["ens", "labs"], + "makotoinoue": ["makoto", "inoue"], + "gregskril": ["greg", "skril"], + "gregskriloff": ["greg", "skriloff"], + "alextnetto": ["alex", "", "netto"], + "blockful": ["blockful"], + "liubenben": ["liu", "ben", "ben"], + "ricmoo": ["ric", "moo"], + "slobo": ["slobo"], + "alexslobodnik": ["alex", "slobodnik"], + "wagmi": ["wagmi"], + "viem": ["viem"], + "jessepollak": ["jesse", "pollak"], + "karpatkey": ["karpatkey"], + "eigenmann": ["eigen", "mann"], + "serenae": ["serenae"], + "erichu": ["eric", "hu"], + "estmcmxci": ["est", "mcmxci"], + "subnames": ["subnames"], + "l2": ["l2"], + "l1": ["l1"], + "mainnet": ["mainnet"], + "clowes": ["clowes"], + "ccip": ["ccip"], + "ensip": ["ensip"], + "muhammedtanrıkulu": ["muhammed", "tanrıkulu"], + "binance": ["binance"], + "khoriwhittaker": ["khori", "whittaker"], + "leontalbert": ["leon", "talbert"], + "eskender": ["eskender"], + "domico": ["domico"], + "3dns": ["3dns"], + "arbitrum": ["arbitrum"], + "etherscan": ["etherscan"], + "ether": ["ether"], + "patricioworthalter": ["patricio", "worthalter"], + "zk": ["zk"], + "mely": ["mely"], + "eskenderabebe": ["eskender", "abebe"], + "sydmead": ["syd", "mead"] +} \ No newline at end of file diff --git a/apps/api.nameai.dev/pyproject.toml b/apps/api.nameai.dev/pyproject.toml index 9cf00d24..26e58987 100644 --- a/apps/api.nameai.dev/pyproject.toml +++ b/apps/api.nameai.dev/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "nameai" -version = "0.1.4" +version = "0.1.5" description = "NameHash NameAI API" authors = ["NameHash Team "] maintainers = ["NameHash Team "] diff --git a/apps/api.nameai.dev/tests/test_ngrams.py b/apps/api.nameai.dev/tests/test_ngrams.py index 82fa41c8..5598eeac 100644 --- a/apps/api.nameai.dev/tests/test_ngrams.py +++ b/apps/api.nameai.dev/tests/test_ngrams.py @@ -120,3 +120,24 @@ def test_gap_prob(): if None not in (tok1, tok2): break assert ngrams.sequence_probability(tok1) < ngrams.sequence_probability(tok2) + + +def test_all_tokenizer_quality(): + with init_ngrams_tokenizer([]) as (ngrams, tokenizer): + from nameai.data import get_resource_path + import json + + # Load tokenization quality test cases + with open(get_resource_path('tests/tokenization_quality.json')) as f: + quality_tests = json.load(f) + + for input_text, expected_tokens in quality_tests.items(): + tokenized_labels = list(tokenizer.tokenize(input_text)) + expected_tuple = tuple(expected_tokens) + if expected_tuple != tokenized_labels[0]: + print(input_text) + print(tokenized_labels[0]) + print(expected_tuple) + print(ngrams.sequence_probability(tokenized_labels[0])) + print(ngrams.sequence_probability(expected_tuple)) + print() diff --git a/apps/api.nameai.dev/tests/test_tokenizer.py b/apps/api.nameai.dev/tests/test_tokenizer.py index 535a7163..d7ca1de4 100644 --- a/apps/api.nameai.dev/tests/test_tokenizer.py +++ b/apps/api.nameai.dev/tests/test_tokenizer.py @@ -209,3 +209,27 @@ def test_all_tokenizer_quality(): multiword = multiword.strip() tokenized_labels = list(tokenizer.tokenize(multiword)) assert all([len(tokenized_label) > 1 for tokenized_label in tokenized_labels]) + + +def test_all_tokenizer_quality2(): + with init_tokenizer([]) as tokenizer: + from nameai.data import get_resource_path + import json + + # Load tokenization quality test cases + with open(get_resource_path('tests/tokenization_quality.json')) as f: + quality_tests = json.load(f) + + failures = [] + for input_text, expected_tokens in quality_tests.items(): + tokenized_labels = list(tokenizer.tokenize(input_text)) + expected_tuple = tuple(expected_tokens) + if expected_tuple not in tokenized_labels: + failures.append(f"\nInput: '{input_text}'\nExpected: {expected_tokens}\nGot: {tokenized_labels}") + + if failures: + print('\n=== Tokenization Quality Test Failures ===') + for failure in failures: + print(failure) + print(f'\nTotal failures: {len(failures)} out of {len(quality_tests)} test cases') + assert False, 'Some tokenization quality tests failed. See above for details.'