Skip to content

Commit

Permalink
introduces optional affix-separation
Browse files Browse the repository at this point in the history
- input-blocks can now be configured through the new bool $splitAffixIntoSymbols
  parameter to generate prepended and appended Tokens instead of prefix/suffix
  attributes, allowing easier parsing of natural-language-like syntaxes
  • Loading branch information
ricwein committed Mar 25, 2020
1 parent ea9f844 commit 7647d16
Show file tree
Hide file tree
Showing 3 changed files with 77 additions and 13 deletions.
13 changes: 12 additions & 1 deletion src/InputSymbols/Block.php
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,16 @@ class Block
private Delimiter $symbolClose;

private bool $shouldTokenizeContent;
private bool $splitAffixIntoSymbols;

/**
* Block constructor.
* @param string $symbolOpen
* @param string|null $symbolClose
* @param bool $shouldTokenizeContent
* @param bool $splitAffixIntoSymbols
*/
public function __construct(string $symbolOpen, ?string $symbolClose, bool $shouldTokenizeContent)
public function __construct(string $symbolOpen, ?string $symbolClose, bool $shouldTokenizeContent, bool $splitAffixIntoSymbols = false)
{
$this->symbolOpen = new Delimiter($symbolOpen);

Expand All @@ -26,6 +28,7 @@ public function __construct(string $symbolOpen, ?string $symbolClose, bool $shou
}

$this->shouldTokenizeContent = $shouldTokenizeContent;
$this->splitAffixIntoSymbols = $splitAffixIntoSymbols;
}

/**
Expand Down Expand Up @@ -66,6 +69,14 @@ public function shouldTokenizeContent(): bool
return $this->shouldTokenizeContent;
}

/**
* Whether text directly adjacent to this block should be emitted as separate
* tokens instead of being attached to the block token as prefix/suffix.
*
* @return bool the $splitAffixIntoSymbols flag passed to the constructor
*/
public function splitAffixIntoSymbols(): bool
{
return $this->splitAffixIntoSymbols;
}

public function __toString()
{
return "{$this->open()}{$this->close()}";
Expand Down
36 changes: 30 additions & 6 deletions src/Tokenizer.php
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

use ricwein\Tokenizer\InputSymbols\Block;
use ricwein\Tokenizer\InputSymbols\Delimiter;
use ricwein\Tokenizer\Result\BaseToken;
use ricwein\Tokenizer\Result\TokenStream;
use ricwein\Tokenizer\Result\BlockToken;
use ricwein\Tokenizer\Result\Token;
Expand Down Expand Up @@ -89,7 +90,7 @@ private function process(string $input, int $depth, int $line): array
// abort tokenizing after reaching the max block depth
// just return the input string as the remaining symbol
if ($this->maxDepth > 0 && $depth >= $this->maxDepth) {
return [new Token($input, null)];
return [new Token($input, null, $line)];
}

/** @var BlockToken[]|Token[] $result */
Expand All @@ -98,6 +99,7 @@ private function process(string $input, int $depth, int $line): array
/** @var array|null $openBlocks 'block' => BlockToken, 'startOffset' => int */
$openBlocks = [];

/** @var BaseToken|null $lastSymbol */
$lastSymbol = null;

/** @var Delimiter|null $lastDelimiter */
Expand Down Expand Up @@ -176,9 +178,18 @@ private function process(string $input, int $depth, int $line): array

$resultBlock = new BlockToken($block, $lastDelimiter, $line);
if ($lastOffset < $offset) {
$prefix = ltrim(substr($input, $lastOffset, $offset - $lastOffset));
$prefix = trim(substr($input, $lastOffset, $offset - $lastOffset));
if (!empty($prefix)) {
$resultBlock->withPrefix($prefix);
if ($block->splitAffixIntoSymbols()) {
$lastSymbol = new Token($prefix, $lastDelimiter, $line);

$resultBlock->setDelimiter(null);
$lastDelimiter = null;

$result[] = $lastSymbol;
} else {
$resultBlock->withPrefix($prefix);
}
}
}

Expand All @@ -204,8 +215,15 @@ private function process(string $input, int $depth, int $line): array

// encounter of a symbol directly after a block (no delimiter in between)
if ($lastSymbol instanceof BlockToken) {

if (!empty($content)) {
$lastSymbol->withSuffix($content);

if ($lastSymbol->block()->splitAffixIntoSymbols()) {
$result[] = new Token($content, null, $line);
} else {
$lastSymbol->withSuffix($content);
}

}

// we need to reset the last-symbol, since we processed the
Expand Down Expand Up @@ -233,15 +251,21 @@ private function process(string $input, int $depth, int $line): array
}
}

// handle remaining tokens
$remaining = ltrim($remaining, ' ');
if (strlen($remaining) > 0) {
if ($lastSymbol instanceof BlockToken) {
$lastSymbol->withSuffix($remaining);

if ($lastSymbol->block()->splitAffixIntoSymbols()) {
$result[] = new Token(ltrim($remaining), null, $line);
} else {
$lastSymbol->withSuffix($remaining);
}

} else {
$result[] = new Token($remaining, $lastDelimiter, $line);
}
}

return $result;
}

Expand Down
41 changes: 35 additions & 6 deletions tests/TokenizerTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,14 @@ protected function setUp(): void

$delimiter = [new Delimiter('.'), new Delimiter('|'), new Delimiter(',')];
$blocks = [
new Block('[', ']', true),
new Block('(', ')', true),
new Block('{', '}', false),
new Block('\'', null, false),
new Block('"', null, false),
new Block('[', ']', true, false),
new Block('(', ')', true, false),
new Block('{', '}', false, false),
new Block('\'', null, false, false),
new Block('"', null, false, false),

new Block('{{', '}}', false, true),
new Block('{%', '%}', true, true),
];

$this->tokenizer = new Tokenizer($delimiter, $blocks);
Expand Down Expand Up @@ -291,7 +294,7 @@ public function testLineTracking()
$testString = file_get_contents(__DIR__ . '/test.txt');
$expected = [
new Token('first', null),
(new BlockToken(new Block('(', ')', true), new Delimiter('.'), 2))->withPrefix('second' . PHP_EOL)->withSymbols([
(new BlockToken(new Block('(', ')', true), new Delimiter('.'), 2))->withPrefix('second')->withSymbols([
new Token('line:2', null, 2),
])->withSuffix(PHP_EOL . 'end' . PHP_EOL),
];
Expand All @@ -307,7 +310,33 @@ public function testLineTracking()
new Token('end', new Delimiter(PHP_EOL), 3),
];
$this->assertEquals(new TokenStream($expected), $customTokenizer->tokenize($testString));
}

/**
 * Blocks configured with $splitAffixIntoSymbols = true must yield the text
 * surrounding them as standalone tokens rather than as prefix/suffix
 * attributes of the resulting block token.
 */
public function testAffixSplitting()
{
    // Case 1: a non-tokenized '{{ }}' block between two plain words; the
    // block content is kept verbatim (including its surrounding spaces).
    $printBlock = (new BlockToken(new Block('{{', '}}', false, true), null))
        ->withSymbols([new Token(' test ', null)]);

    $this->assertEquals(
        new TokenStream([
            new Token('before', null),
            $printBlock,
            new Token('after', null),
        ]),
        $this->tokenizer->tokenize("before {{ test }} after")
    );

    // Case 2: a tokenized '{% %}' block preceded by delimited tokens and
    // followed by a quoted block; the inner content is split on '.'.
    $tagBlock = (new BlockToken(new Block('{%', '%}', true, true), null))
        ->withSymbols([
            new Token(' test', null),
            new Token('first ', new Delimiter('.')),
        ]);

    $quotedBlock = (new BlockToken(new Block('\'', '\'', false), null))
        ->withSymbols([new Token('after', null)]);

    $this->assertEquals(
        new TokenStream([
            new Token('before', null),
            new Token('one', new Delimiter('.')),
            $tagBlock,
            $quotedBlock,
        ]),
        $this->tokenizer->tokenize("before.one {% test.first %} 'after'")
    );
}

}

0 comments on commit 7647d16

Please sign in to comment.