Skip to content

Commit

Permalink
Merge pull request #25 from paceaux/paceaux/issue24
Browse files Browse the repository at this point in the history
Paceaux/issue24
  • Loading branch information
paceaux authored Apr 18, 2024
2 parents 7bfdb41 + 424dda1 commit 2ff0ee9
Show file tree
Hide file tree
Showing 11 changed files with 121 additions and 7 deletions.
28 changes: 28 additions & 0 deletions .github/workflows/lint-test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Workflow: lints the code and then runs the test suite.
# Named "Lint and Test" so its runs can be told apart from the lint-only workflow.
name: Lint and Test

# Runs on pushes to, and pull requests targeting, `main` or `develop`.
on:
  push:
    branches:
      - main
      - develop
  pull_request:
    branches:
      - main
      - develop

jobs:
  run-all:
    runs-on: ubuntu-latest
    steps:
      - name: Check out Git repository
        uses: actions/checkout@v4

      - name: Set up Node
        uses: actions/setup-node@v4
        with:
          node-version: 18
      # Clean install from the lockfile, then lint before running the tests.
      - run: npm ci
      - run: npm run lint
      - run: npm run test


20 changes: 20 additions & 0 deletions .github/workflows/lint.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Workflow: runs the project's lint script.
name: Lint

# Triggered by every push; no branch filter is applied.
on: [push]

jobs:
run-linters:
runs-on: ubuntu-latest
steps:
- name: Check out Git repository
uses: actions/checkout@v4

- name: Set up Node
uses: actions/setup-node@v4
with:
node-version: 18

# Clean install from the lockfile, then run the linter.
- run: npm ci
- run: npm run lint


11 changes: 11 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,17 @@ em-dash, period, comma, semicolon, colon, bang, question mark, interrobang, Span
**Returns**
`boolean`

#### `hasSymbols(string)`
determines if a string contains symbols such as `#`, `$` or `^`

**Parameters**
| name | type | Description |
| --- |--- | --- |
| string | string | string to check for symbols |

**Returns**
`boolean`

#### `hasSpace(string)`
determines if a string has a space

Expand Down
9 changes: 7 additions & 2 deletions src/constants.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,18 @@
type RegexQuery = string;

/** a string that can be used by a RegExp to identify the starts and ends of sentences */
const punctuations: RegexQuery = "\\.,;:!?‽¡¿⸘()\\[\\]{}<>’'«»…\"\n\t\r";
const punctuations: RegexQuery = "\\.,;:!?‽¡¿⸘()\\[\\]{}<>’'«»…‘“”\"\n\t\r";

/**
 * the contents of a RegExp character class (use it inside `[...]`) that
 * identifies anything that looks like a symbol.
 */
/* eslint-disable-next-line quotes,max-len */
const symbols: RegexQuery = "\\$\\^!-#%-*,-/:;?@\\[-\\]_\\{\\}\\u00A1\u00A7\u00AB\u00B6\u00B7\u00BB\u00BF\u037E\u0387\u055A-\u055F\u0589\u058A\u05BE\u05C0\u05C3\u05C6\u05F3\u05F4\u0609\u060A\u060C\u060D\u061B\u061E\u061F\u066A-\u066D\u06D4\u0700-\u070D\u07F7-\u07F9\u0830-\u083E\u085E\u0964\u0965\u0970\u0AF0\u0DF4\u0E4F\u0E5A\u0E5B\u0F04-\u0F12\u0F14\u0F3A-\u0F3D\u0F85\u0FD0-\u0FD4\u0FD9\u0FDA\u104A-\u104F\u10FB\u1360-\u1368\u1400\u166D\u166E\u169B\u169C\u16EB-\u16ED\u1735\u1736\u17D4-\u17D6\u17D8-\u17DA\u1800-\u180A\u1944\u1945\u1A1E\u1A1F\u1AA0-\u1AA6\u1AA8-\u1AAD\u1B5A-\u1B60\u1BFC-\u1BFF\u1C3B-\u1C3F\u1C7E\u1C7F\u1CC0-\u1CC7\u1CD3\u2010-\u2027\u2030-\u2043\u2045-\u2051\u2053-\u205E\u207D\u207E\u208D\u208E\u2308-\u230B\u2329\u232A\u2768-\u2775\u27C5\u27C6\u27E6-\u27EF\u2983-\u2998\u29D8-\u29DB\u29FC\u29FD\u2CF9-\u2CFC\u2CFE\u2CFF\u2D70\u2E00-\u2E2E\u2E30-\u2E42\u3001-\u3003\u3008-\u3011\u3014-\u301F\u3030\u303D\u30A0\u30FB\uA4FE\uA4FF\uA60D-\uA60F\uA673\uA67E\uA6F2-\uA6F7\uA874-\uA877\uA8CE\uA8CF\uA8F8-\uA8FA\uA8FC\uA92E\uA92F\uA95F\uA9C1-\uA9CD\uA9DE\uA9DF\uAA5C-\uAA5F\uAADE\uAADF\uAAF0\uAAF1\uABEB\uFD3E\uFD3F\uFE10-\uFE19\uFE30-\uFE52\uFE54-\uFE61\uFE63\uFE68\uFE6A\uFE6B\uFF01-\uFF03\uFF05-\uFF0A\uFF0C-\uFF0F\uFF1A\uFF1B\uFF1F\uFF20\uFF3B-\uFF3D\uFF3F\uFF5B\uFF5D\uFF5F-\uFF65";

/** a string that can be used by a RegExp to identify symbols that can separate words */
const wordSeparators: RegexQuery = '—\\.,;:!?‽¡¿⸘()\\[\\]{}<>«»…"\\s';
const wordSeparators: RegexQuery = '—\\.,;:!?‽¡¿⸘()\\[\\]{}<>«»…‘“”"\\s';

export {
punctuations,
wordSeparators,
symbols,
RegexQuery,
};
2 changes: 2 additions & 0 deletions src/functions.ngrams.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import {
hasPunctuation,
hasSymbols,
hasSpace,
getWords,
} from './functions.tokenizers';
Expand All @@ -24,6 +25,7 @@ function getNGrams(text:string, gramSize: number = 2) : NGramSequence {
if (
!hasPunctuation(substring)
&& !hasSpace(substring)
&& !hasSymbols(substring)
) {
bigrams.push(substring);
}
Expand Down
17 changes: 15 additions & 2 deletions src/functions.tokenizers.ts
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
import {
wordSeparators,
punctuations,
symbols,
} from './constants';
import { Word } from './types';

/**
* @description determins if string contains punctuation
* @description determines if string contains punctuation
* @param {string} text - string to check for punctuation
* @returns {boolean} - true if string contains punctuation
*/
Expand All @@ -16,7 +17,18 @@ function hasPunctuation(text: string): boolean {
}

/**
* @description determins if a string has a space
* @description determines if a string contains symbols such as #, $ or ^
* @param {string} text - string to check for symbols
* @returns {boolean} - true if string contains symbols
*/
function hasSymbols(text: string): boolean {
  // `symbols` holds the contents of a character class, so wrapping it in
  // [...] matches any single symbol character.
  // No `g` flag and no capture group: `test()` only needs to know whether one
  // match exists, and a global regex would carry `lastIndex` state between
  // calls if it were ever reused.
  const symbolRegEx = new RegExp(`[${symbols}]`);

  return symbolRegEx.test(text);
}

/**
* @description determines if a string has a space
* @param {string} text - string to check for space
* @returns {boolean} - true if string contains space
*/
Expand Down Expand Up @@ -59,6 +71,7 @@ function getWords(text:string) : Word[] {

export {
hasPunctuation,
hasSymbols,
hasSpace,
sanitizeText,
getWords,
Expand Down
4 changes: 3 additions & 1 deletion src/methodius.class.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import { NGram } from './types';
import { punctuations, wordSeparators } from './constants';
import {
hasPunctuation, hasSpace, sanitizeText, getWords,
hasPunctuation, hasSymbols, hasSpace, sanitizeText, getWords,
} from './functions.tokenizers';
import { getMeanWordSize, getMedianWordSize } from './functions.metrics.words';
import {
Expand Down Expand Up @@ -53,6 +53,8 @@ export default class Methodius {

static hasPunctuation = hasPunctuation;

static hasSymbols = hasSymbols;

static hasSpace = hasSpace;

static sanitizeText = sanitizeText;
Expand Down
1 change: 1 addition & 0 deletions test/ts/class.static.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ describe('static class members', () => {
expect(Methodius).toHaveProperty('punctuations');
expect(Methodius).toHaveProperty('wordSeparators');
expect(Methodius).toHaveProperty('hasPunctuation');
expect(Methodius).toHaveProperty('hasSymbols');
expect(Methodius).toHaveProperty('hasSpace');
expect(Methodius).toHaveProperty('sanitizeText');
expect(Methodius).toHaveProperty('getMeanWordSize');
Expand Down
4 changes: 2 additions & 2 deletions test/ts/constants.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,11 @@ import { punctuations, wordSeparators } from '../../src/constants';

it('has all the common punctuations', () => {
expect(punctuations).toEqual(
"\\.,;:!?‽¡¿⸘()\\[\\]{}<>’'«»…\"\n\t\r",
"\\.,;:!?‽¡¿⸘()\\[\\]{}<>’'«»…‘“”\"\n\t\r",
);
});
it('has common word Separators', () => {
expect(wordSeparators).toEqual(
'—\\.,;:!?‽¡¿⸘()\\[\\]{}<>«»…"\\s',
'—\\.,;:!?‽¡¿⸘()\\[\\]{}<>«»…‘“”"\\s',
);
});
21 changes: 21 additions & 0 deletions test/ts/functions.ngrams.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,27 @@ describe('getNGrams', () => {
'rld',
]);
});
// Symbols never appear inside an n-gram: every candidate substring that contains
// one is discarded, so n-grams only come from the runs on either side of it.
it('will not have any ngrams with non-letter non-numbers', () => {
// a hyphen splits the word: 't-' and '-b' are dropped
expect(getNGrams('foot-ball')).toEqual(['fo', 'oo', 'ot', 'ba', 'al', 'll']);
// same for an underscore, this time with a gram size of 3
expect(getNGrams('foot_ball', 3)).toEqual(['foo', 'oot', 'bal', 'all']);
// consecutive symbols and a trailing symbol contribute nothing
expect(getNGrams('foot-_ball')).toEqual(['fo', 'oo', 'ot', 'ba', 'al', 'll']);
expect(getNGrams('foot-_ball_')).toEqual(['fo', 'oo', 'ot', 'ba', 'al', 'll']);
// a leading symbol only removes the n-gram that contains it
expect(getNGrams('#football')).toEqual(['fo', 'oo', 'ot', 'tb', 'ba', 'al', 'll']);
// a symbol inside a word removes both n-grams it touches; digits are kept
expect(getNGrams('footb@ll')).toEqual(['fo', 'oo', 'ot', 'tb', 'll']);
expect(getNGrams('footb@11')).toEqual(['fo', 'oo', 'ot', 'tb', '11']);
});
// Hebrew letters must not be filtered out: a 4-letter word yields all 3 bigrams.
it('will work on Hebrew', () => {
expect(getNGrams('שלום')).toEqual(['של', 'לו', 'ום']);
});
// Cyrillic letters must not be filtered out: a 6-letter word yields all 5 bigrams.
it('will work on Ukrainian', () => {
expect(getNGrams('привіт')).toEqual(['пр', 'ри', 'ив', 'ві', 'іт']);
});
// Quote characters around a word must not end up inside any bigram.
// Covers straight double, straight single, curly double and guillemet quotes;
// each input is expected to give the same bigrams as plain 'hello world'.
it('will not get tripped up by quotes', () => {
expect(getNGrams('hello "world"')).toEqual(['he', 'el', 'll', 'lo', 'wo', 'or', 'rl', 'ld']);
expect(getNGrams('hello \'world\'')).toEqual(['he', 'el', 'll', 'lo', 'wo', 'or', 'rl', 'ld']);
expect(getNGrams('hello “world”')).toEqual(['he', 'el', 'll', 'lo', 'wo', 'or', 'rl', 'ld']);
expect(getNGrams('«hello world»')).toEqual(['he', 'el', 'll', 'lo', 'wo', 'or', 'rl', 'ld']);
});
});
describe('getWordNGrams', () => {
it('will get a default size of 2 ', () => {
Expand Down
11 changes: 11 additions & 0 deletions test/ts/functions.tokenizers.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ import { describe, expect, it } from '@jest/globals';
import {
hasPunctuation,
hasSpace,
hasSymbols,
sanitizeText,
getWords,
} from '../../src/functions.tokenizers';
Expand All @@ -14,6 +15,16 @@ describe('tokenization', () => {
expect(hasPunctuation('hello')).toBe(false);
expect(hasPunctuation('Don\'t shouldn\'t')).toBe(true);
});
// hasSymbols should be true as soon as one symbol character is present,
// and false for text made only of letters, whatever the script.
it('can recognize symbols', () => {
// symbol at the start or in the middle of the text
expect(hasSymbols('$3 for food')).toBe(true);
expect(hasSymbols('5% for food')).toBe(true);
expect(hasSymbols('#FoodFun')).toBe(true);
expect(hasSymbols('^FoodFun')).toBe(true);
// hyphen and underscore at the end of the text
expect(hasSymbols('FoodFun-_')).toBe(true);
// letters only: Latin, Cyrillic and Arabic
expect(hasSymbols('FoodFun')).toBe(false);
expect(hasSymbols('фвавф')).toBe(false);
expect(hasSymbols('شبيش')).toBe(false);
});
it('can determine if a bit of text has spaces', () => {
expect(hasSpace('hello world')).toBe(true);
expect(hasSpace('hello')).toBe(false);
Expand Down

0 comments on commit 2ff0ee9

Please sign in to comment.