Skip to content

Commit

Permalink
post-merge pools code, see issue #878
Browse files Browse the repository at this point in the history
  • Loading branch information
grandsbor committed Jul 8, 2020
1 parent 4c2b27a commit d62b1a9
Show file tree
Hide file tree
Showing 3 changed files with 146 additions and 7 deletions.
101 changes: 94 additions & 7 deletions lib/lib_annot.php
Original file line number Diff line number Diff line change
@@ -1,6 +1,43 @@
<?php
require_once('lib_books.php');

/**
 * Constants naming how the grammemes of a single GramFilter are combined.
 */
class GramFilterType {
    /** The filter is satisfied as soon as any one of its grammemes matches. */
    const FLT_OR = 0;

    /** The filter is satisfied only if every one of its grammemes matches. */
    const FLT_AND = 1;
}
/**
 * A single grammeme filter parsed from a string such as "a|b" or "a&b".
 *
 * The string is split on "|" (OR filter) or on "&" (AND filter); a string
 * containing neither operator becomes an AND filter with one grammeme.
 * Mixing both operators in one string is rejected.
 */
class GramFilter {
    /** One of the GramFilterType constants. */
    public $match_type;

    /** Trimmed grammeme strings obtained by splitting the filter string. */
    public $grammemes = array();

    /**
     * @param string $gram_str filter string using either "|" or "&" as separator
     * @throws Exception when the string contains both "|" and "&"
     */
    public function __construct($gram_str) {
        $is_or = (strpos($gram_str, "|") !== false);

        // "&" only matters for validation when "|" is present as well.
        if ($is_or && strpos($gram_str, "&") !== false) {
            throw new Exception("Gram filter must be either OR or AND");
        }

        if ($is_or) {
            $this->match_type = GramFilterType::FLT_OR;
            $separator = "|";
        } else {
            // either no operators (one grammeme) or AND
            $this->match_type = GramFilterType::FLT_AND;
            $separator = "&";
        }

        $this->grammemes = array_map("trim", explode($separator, $gram_str));
    }
}
/**
 * An ordered list of GramFilter objects parsed from one "@"-separated string.
 */
class GramFilterSet {
    /** GramFilter instances, in the order their source items appear in the string. */
    public $filters;

    /**
     * @param string $at_separated_str filter strings joined with "@";
     *                                 each item is trimmed before parsing
     */
    public function __construct($at_separated_str) {
        $this->filters = array_map(
            function ($piece) {
                return new GramFilter(trim($piece));
            },
            explode('@', $at_separated_str)
        );
    }
}

class MorphParse {
public $lemma_id = 0;
public $lemma_text;
Expand Down Expand Up @@ -55,6 +92,20 @@ public function is_unknown() {
/**
 * Returns the 'inner' value of every entry in this parse's gramlist.
 *
 * @return array list of 'inner' values, in gramlist order
 */
public function get_inner_gramlist() {
    $inner_names = array_column($this->gramlist, 'inner');
    return $inner_names;
}

/**
 * Tests this parse's inner grammemes against a single filter.
 *
 * A filter grammeme starting with "!" is satisfied when the rest of the
 * string is NOT among the parse's grammemes; any other grammeme is
 * satisfied when it is present. An OR filter matches on the first satisfied
 * grammeme, an AND filter fails on the first unsatisfied one.
 *
 * @param GramFilter $filter filter to test against
 * @return bool true if the parse satisfies the filter
 */
public function match(GramFilter $filter) {
    $gramlist = $this->get_inner_gramlist();
    foreach ($filter->grammemes as $gr) {
        // substr() instead of $gr[0]: an empty grammeme (e.g. from "a|" or "")
        // would otherwise raise an "Uninitialized string offset" diagnostic.
        $negated = (substr($gr, 0, 1) === '!');
        $has = $negated
            ? !in_array(substr($gr, 1), $gramlist)
            : in_array($gr, $gramlist);
        if ($has && $filter->match_type == GramFilterType::FLT_OR) {
            return true;
        }
        if (!$has && $filter->match_type == GramFilterType::FLT_AND) {
            return false;
        }
    }
    // Loop exhausted: an AND filter had no failures, an OR filter had no hits.
    return $filter->match_type == GramFilterType::FLT_AND;
}
}

class MorphParseUnknown extends MorphParse {
Expand Down Expand Up @@ -93,8 +144,7 @@ public function filter_by_lemma($lemma_id, $allow) {
if (($parse->lemma_id == $lemma_id) == $allow)
$newparses[] = $parse;
$this->parses = $newparses;
if (sizeof($this->parses) == 0)
$this->_from_token($this->token_text, true, false);
$this->_check_empty();
}

public function filter_by_parse_index($index_array) {
Expand All @@ -103,8 +153,42 @@ public function filter_by_parse_index($index_array) {
if (in_array($i, $index_array))
$newparses[] = $parse;
$this->parses = $newparses;
if (sizeof($this->parses) == 0)
$this->_from_token($this->token_text, true, false);
$this->_check_empty();
}

/**
 * Keeps only the parses that match the given filter.
 *
 * Despite the name, the argument is a single GramFilter. After filtering,
 * _check_empty() is called to handle the case where no parse is left.
 *
 * @param GramFilter $gram_filter filter every surviving parse must match
 */
public function filter_by_gram_set(GramFilter $gram_filter) {
    $kept = array_filter(
        $this->parses,
        function ($candidate) use ($gram_filter) {
            return $candidate->match($gram_filter);
        }
    );
    // array_filter preserves keys; reindex to get a plain list again.
    $this->parses = array_values($kept);
    $this->_check_empty();
}

/**
 * Tells whether every parse in this set matches the filter.
 *
 * @param GramFilter $filter filter applied to each parse
 * @return bool false as soon as one parse fails to match; true otherwise
 *              (including when there are no parses)
 */
public function match(GramFilter $filter) {
    $all_match = true;
    foreach ($this->parses as $candidate) {
        if ($candidate->match($filter)) {
            continue;
        }
        $all_match = false;
        break;
    }
    return $all_match;
}

/**
 * Checks that the parses and the filters of a set cover each other.
 *
 * Returns true only if every parse matches exactly one filter of the set
 * and every filter of the set is matched by at least one parse.
 *
 * @param GramFilterSet $flt_set filters to test the parses against
 * @return bool
 */
public function match_set(GramFilterSet $flt_set) {
    // Indexes of filters matched by at least one parse, stored as keys.
    $seen = array();
    foreach ($this->parses as $candidate) {
        $hits = 0;
        foreach ($flt_set->filters as $idx => $flt) {
            if (!$candidate->match($flt)) {
                continue;
            }
            $hits++;
            $seen[$idx] = true;
        }
        // A parse matching no filter, or more than one, disqualifies the set.
        if ($hits != 1) {
            return false;
        }
    }
    return count($seen) == count($flt_set->filters);
}

public function remove_parse($lemma_id, $grams) {
Expand All @@ -114,9 +198,7 @@ public function remove_parse($lemma_id, $grams) {
$new_parses[] = $parse;
}
$this->parses = $new_parses;

if (sizeof($this->parses) == 0)
$this->parses[] = new MorphParseUnknown($this->token_text);
$this->_check_empty();
}

public function set_lemma_text($lemma_id, $lemma_text) {
Expand Down Expand Up @@ -292,6 +374,11 @@ private function _make_parses_unique() {
}
$this->parses = $uniq;
}

/**
 * Re-initialises the set via _from_token() when no parses are left.
 *
 * Does nothing while at least one parse remains.
 */
private function _check_empty() {
    if (count($this->parses) > 0) {
        return;
    }
    $this->_from_token($this->token_text, true, false);
}
}

function get_sentence($sent_id) {
Expand Down
41 changes: 41 additions & 0 deletions lib/lib_morph_pools.php
Original file line number Diff line number Diff line change
Expand Up @@ -1129,3 +1129,44 @@ function get_pool_manual_page($type_id) {
return $res[0]['doc_link'];
}

/**
 * Retries merging moderated samples that were left unmerged in archived pools.
 *
 * For each selected sample the token's latest revision is parsed and compared
 * with the filter that corresponds to the stored answer:
 *  - unknown parse sets are skipped;
 *  - if all parses already match the answer's filter, the sample is only
 *    marked as post-merged;
 *  - if the parses match the pool type as a whole, they are narrowed down to
 *    the answer's filter, a new token revision is created and the sample is
 *    marked as post-merged;
 *  - otherwise the sample is left untouched.
 *
 * All updates happen between sql_begin() and sql_commit().
 */
function post_merge_pools() {
    // get moderated samples that failed to be merged
    $res = sql_query("
SELECT tfr.rev_text, t.grammemes, sample_id, tokens.tf_id AS token_id, answer
FROM morph_annot_moderated_samples ms
LEFT JOIN morph_annot_samples s USING (sample_id)
LEFT JOIN morph_annot_pools p USING (pool_id)
LEFT JOIN morph_annot_pool_types t ON (p.pool_type = t.type_id)
LEFT JOIN tokens USING (tf_id)
LEFT JOIN tf_revisions tfr
ON (tokens.tf_id = tfr.tf_id AND tfr.is_last = 1)
WHERE p.status = ".MA_POOLS_STATUS_ARCHIVED."
AND ms.status IN (".join(',', [MA_SAMPLES_STATUS_OK, MA_SAMPLES_STATUS_ALMOST_OK]).")
AND merge_status = ".MA_MERGE_STATUS_NOT_MERGED."
");
    $upd = sql_prepare("
UPDATE morph_annot_moderated_samples
SET merge_status = ".MA_MERGE_STATUS_POST_OK."
WHERE sample_id = ?
LIMIT 1
");
    sql_begin();
    foreach ($res as $row) {
        // check if current parses match pool type:
        // every parse must match exactly one gramset
        // and every gramset must be matched by at least one parse
        $filterset = new GramFilterSet($row['grammemes']);

        // Answers are 1-based positions in the pool type's filter list.
        // Skip answers that do not point at an existing filter (NULL, 0 or
        // out of range); otherwise a null filter would reach match() and
        // abort the whole run in the middle of the transaction.
        $answer_idx = $row['answer'] - 1;
        if (!isset($filterset->filters[$answer_idx])) {
            continue;
        }
        $correct = $filterset->filters[$answer_idx];

        $pset = new MorphParseSet($row['rev_text']);
        if ($pset->is_unknown()) {
            continue;
        } else if ($pset->match($correct)) {
            // Parses already agree with the answer: just record the merge.
            sql_execute($upd, array($row['sample_id']));
        } else if ($pset->match_set($filterset)) {
            $pset->filter_by_gram_set($correct);
            create_tf_revision(current_revset("Post-merge pools data, see issue #878", 0), $row['token_id'], $pset->to_xml());
            sql_execute($upd, array($row['sample_id']));
        }
    }
    sql_commit();
}
11 changes: 11 additions & 0 deletions scripts/ma_pools/post_merge.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
<?php

// Command-line entry point: refuses to run under any other SAPI, then loads
// the project libraries from /corpus and runs the post-merge pass.
if (php_sapi_name() !== 'cli') {
    exit("This script is for CLI only");
}

// Make 'lib/...' resolvable relative to the /corpus directory.
set_include_path(get_include_path() . PATH_SEPARATOR . '/corpus');
require_once 'lib/header_ajax.php';
require_once 'lib/lib_morph_pools.php';

post_merge_pools();

0 comments on commit d62b1a9

Please sign in to comment.