-
Notifications
You must be signed in to change notification settings - Fork 14
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
mdsills
committed
Sep 24, 2016
0 parents
commit 972c4c4
Showing
9 changed files
with
410 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
demo/*.gz | ||
demo/*.zip |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
MIT License | ||
|
||
Copyright (c) 2016 mdsills | ||
|
||
Permission is hereby granted, free of charge, to any person obtaining a copy | ||
of this software and associated documentation files (the "Software"), to deal | ||
in the Software without restriction, including without limitation the rights | ||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
copies of the Software, and to permit persons to whom the Software is | ||
furnished to do so, subject to the following conditions: | ||
|
||
The above copyright notice and this permission notice shall be included in all | ||
copies or substantial portions of the Software. | ||
|
||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
SOFTWARE. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
# cccedict | ||
|
||
## Demo | ||
Download the current CC-CEDICT file from http://www.mdbg.net/chindict/chindict.php?page=cc-cedict into the demo folder. | ||
|
||
``` | ||
cd demo | ||
wget -O cedict.gz http://www.mdbg.net/chindict/export/cedict/cedict_1_0_ts_utf-8_mdbg.txt.gz | ||
php -f index.php | ||
``` | ||
|
||
## About | ||
Reads from a CC-CEDICT Chinese dictionary file, and outputs structured data. | ||
|
||
## Limitations | ||
- Requires a lot of memory - a piece-by-piece interface would be handy. | ||
- The CC-CEDICT pinyin is not necessarily what you expect - there are stub functions in ```Entry.php``` to rewrite this. | ||
- There is little flexibility - perhaps a ```setOptions()``` on ```Parser.php```? |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
<?php | ||
|
||
/** | ||
* @see http://www.php-fig.org/psr/psr-4/examples/ | ||
*/ | ||
spl_autoload_register(function ($class) {
    // Namespace handled by this loader, and the directory its classes live in.
    $namespacePrefix = 'CcCedict\\';
    $sourceRoot = __DIR__ . '/../src/CcCedict/';
    $prefixLength = strlen($namespacePrefix);

    // Only classes inside the CcCedict namespace are resolved here; anything
    // else is left for other registered autoloaders.
    if (strncmp($namespacePrefix, $class, $prefixLength) === 0) {
        // Map the remaining namespace segments onto a relative file path.
        $relativePath = str_replace('\\', '/', substr($class, $prefixLength));
        $candidate = $sourceRoot . $relativePath . '.php';

        // A missing file is not an error: the class simply stays undefined.
        if (file_exists($candidate)) {
            require $candidate;
        }
    }
});
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
<?php | ||
|
||
// The whole dictionary is held in memory while parsing, so raise the limit.
ini_set('memory_limit', '512M');

// Make the mb_* functions default to UTF-8.
mb_internal_encoding('UTF-8');

// Allow the script up to 300 seconds of execution time.
set_time_limit(300);
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
<?php | ||
|
||
// Resolve the includes relative to this file so the demo can be started from
// any working directory (the input file below is already anchored the same way).
require(__DIR__ . '/config.php');
require(__DIR__ . '/autoload.php');

use CcCedict\Parser;
use CcCedict\Unpacker;

// UNPACKING
// file comes from http://www.mdbg.net/chindict/chindict.php?page=cc-cedict
// either zipped or gzipped - we need to unpack it
$unpacker = new Unpacker();

// optionally set a directory for the Unpacker to unpack into
// $unpacker->setTempDirectory('/tmp');

// tell Unpacker the file to operate on
$unpacker->setInputFile(__DIR__ . '/cedict.gz');

// do the unpack, and tell us where to find the uncompressed file
$filePath = $unpacker->unpack();

try {
    // PARSING
    // now we can parse it
    $parser = new Parser();

    // tell the parser where the uncompressed data is
    $parser->setFilePath($filePath);

    // do the parse
    $output = $parser->parse();

    // print the output
    print_r($output);
} finally {
    // remove the temporary file, even when parsing or printing failed
    $unpacker->removeOutputFile();
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,86 @@ | ||
<?php | ||
|
||
namespace CcCedict; | ||
|
||
class Entry
{
    /**
     * holds the data about one entry
     *
     * @var array
     */
    private $data = [];

    /**
     * sets the data values from the parser's match data
     *
     * Expected indexes: 0 = whole matched line, 1 = traditional, 2 = simplified,
     * 3 = pinyin as written in the dictionary, 4 = English glosses joined by "/".
     *
     * @param array $match
     */
    public function setData($match)
    {
        $this->data['original'] = $match[0];
        $this->data['traditional'] = $match[1];
        $this->data['simplified'] = $match[2];
        $this->data['pinyin'] = $match[3];
        // each pinyin flavour goes through its own converter
        $this->data['pinyinNumeric'] = $this->convertToPinyinNumeric($match[3]);
        $this->data['pinyinDiacritic'] = $this->convertToPinyinDiacritic($match[3]);
        $this->data['english'] = $match[4];
    }

    /**
     * gets a basic report of the entry content
     *
     * The report is built on a copy of the stored data, so the raw values set
     * by setData() stay untouched and this method can be called repeatedly.
     *
     * @return array the stored values, with 'english' split into a list of
     *               glosses and 'traditionalChars' / 'simplifiedChars' added
     */
    public function getBasic()
    {
        $report = $this->data;

        $report['english'] = explode('/', $this->data['english']);
        $report['traditionalChars'] = $this->extractChineseChars($this->data['traditional']);
        $report['simplifiedChars'] = $this->extractChineseChars($this->data['simplified']);

        return $report;
    }

    /**
     * extracts the Chinese characters
     *
     * Every code point in the Unicode "Letter, other" category is returned as
     * its own array element; anything else in the string is dropped.
     *
     * @param string $chinese String with Chinese characters in it
     * @return array
     */
    private function extractChineseChars($chinese)
    {
        preg_match_all('#\p{Lo}#u', $chinese, $matches);

        return $matches[0];
    }

    /**
     * Converts the CC-CEDICT pinyin to more familiar numeric pinyin
     *
     * unimplemented
     * definitely worth reading https://cc-cedict.org/wiki/format:syntax before
     * getting into this
     *
     * @todo
     * @param string $pinyin
     * @return string|null currently always null, as no conversion exists yet
     */
    private function convertToPinyinNumeric($pinyin)
    {
        return null;
    }

    /**
     * Converts the CC-CEDICT pinyin to accented/diacritic-marked pinyin
     *
     * unimplemented
     * definitely worth reading https://cc-cedict.org/wiki/format:syntax before
     * getting into this
     *
     * @todo
     * @param string $pinyin
     * @return string|null currently always null, as no conversion exists yet
     */
    private function convertToPinyinDiacritic($pinyin)
    {
        return null;
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,97 @@ | ||
<?php | ||
|
||
namespace CcCedict; | ||
|
||
/** | ||
* Class for parsing the CC-CEDICT dictionary | ||
*/ | ||
class Parser
{
    /**
     * path/filename to the CC-CEDICT data
     *
     * @var string
     */
    private $filePath;

    /**
     * Sets the path/filename containing the raw uncompressed CC-CEDICT data
     *
     * @param string $filePath
     */
    public function setFilePath($filePath)
    {
        $this->filePath = $filePath;
    }

    /**
     * Parses the data
     *
     * Every non-comment line is either parsed into an entry report or, when it
     * does not look like a dictionary entry, collected verbatim as skipped.
     *
     * @return array with keys 'numSkipped', 'numParsed', 'parsedLines' and
     *               'skippedLines'
     * @throws \RuntimeException when the configured file cannot be read
     */
    public function parse()
    {
        $skippedLines = [];
        $parsedLines = [];

        foreach ($this->readLines() as $line) {
            $parsedLine = $this->parseLine($line);

            if ($parsedLine) {
                $parsedLines[] = $parsedLine;
            } else {
                $skippedLines[] = $line;
            }
        }

        return [
            'numSkipped' => count($skippedLines),
            'numParsed' => count($parsedLines),
            'parsedLines' => $parsedLines,
            'skippedLines' => $skippedLines,
        ];
    }

    /**
     * reads lines from the file, and removes comments
     *
     * Lines keep their trailing newline; a line counts as a comment when its
     * very first character is "#".
     *
     * @return array
     * @throws \RuntimeException when the configured file cannot be read
     */
    private function readLines()
    {
        if (!is_string($this->filePath) || !is_readable($this->filePath)) {
            throw new \RuntimeException(sprintf(
                'Unable to read CC-CEDICT data from "%s"',
                is_string($this->filePath) ? $this->filePath : ''
            ));
        }

        $lines = file($this->filePath);

        // file() reports failure by returning false rather than an array
        if ($lines === false) {
            throw new \RuntimeException(sprintf(
                'Unable to read CC-CEDICT data from "%s"',
                $this->filePath
            ));
        }

        $outputLines = [];

        foreach ($lines as $line) {
            if (strpos($line, '#') !== 0) {
                $outputLines[] = $line;
            }
        }

        return $outputLines;
    }

    /**
     * parses a single line from the file, checking to see it meets basic dictionary spec
     *
     * @param string $line A line from the CC-CEDICT file
     * @return false|array the entry report, or false when the line is not a
     *                     well-formed entry
     */
    private function parseLine($line)
    {
        $line = trim($line);

        // Traditional Simplified [pin1 yin1] /English equivalent 1/equivalent 2/
        // 中國 中国 [Zhong1 guo2] /China/Middle Kingdom/
        //
        // The pattern is anchored to the whole line and the headwords / pinyin
        // are not allowed to run across their delimiters, so brackets or spaces
        // inside the English part cannot shift the field boundaries. The "u"
        // modifier makes the line be treated as UTF-8.
        if (preg_match('#^(\S+) (\S+) \[([^\]]+)\] /(.*)/$#u', $line, $match)) {
            $entry = new Entry();
            $entry->setData($match);

            return $entry->getBasic();
        }

        return false;
    }
}
Oops, something went wrong.