Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,4 @@ secured_data/import
*.log
*.backup
*.swp
.idea/
5 changes: 4 additions & 1 deletion composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -35,5 +35,8 @@
"engine/include/define/def_flags.php",
"engine/include/define/def_roles.php"
]
}
},
"require": {
"ext-json": "*"
}
}
161 changes: 161 additions & 0 deletions engine/include/export/ConllAndJsonFactory.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
<?php


/**
 * Writes a single document as a tab-separated CoNLL-like file and as a JSON file.
 *
 * Both files are created next to each other: "<path>.conll" and "<path>.json".
 */
class ConllAndJsonFactory {

    /**
     * Exports the document to "<file_path_without_ext>.conll" and "<file_path_without_ext>.json".
     *
     * CoNLL columns (tab-separated, one token per line, empty line after every sentence):
     * ORDER_ID - value of the token's "id" property
     * TOKEN_ID - 0-based order of the token in the sentence
     * ORTH - orth of the token
     * CTAG - ctag of the first lexeme of the token (empty when the token has no lexemes)
     * FROM - offset start in text for the token
     * TO - offset end in text for the token
     * ANN_TAGS - annotation names for the token prefixed with "B-"/"I-", joined by ":", "O" when none
     * ANN_IDS - annotation ids for the token, in the same order as ANN_TAGS, joined by ":", "_" when none
     * REL_IDS - ids of relations attached to the token's annotations, joined by ":", "_" when none
     * REL_TARGET_ANN_IDS - one entry per REL_IDS entry, in the same order: the target annotation id
     *                      when the token's annotation is the source of the relation, "_" otherwise
     *
     * The method is static because it is invoked as ConllAndJsonFactory::exportToConllAndJson(...);
     * calling it on an instance keeps working.
     *
     * @param string $file_path_without_ext output path without the file extension
     * @param object $ccl document exposing chunks -> sentences -> tokens
     * @param array $tokens token rows with "from", "to" and "token_id" keys
     * @param array $relations relation rows with "id", "source_id" and "target_id" keys
     * @param array $annotations annotation rows with "id", "from" and "to" keys
     * @param array $tokens_ids token ids listed in the order the tokens appear in $ccl
     * @param array $annotations_by_id annotation rows (with "id" and "name") indexed by annotation id
     * @throws Exception when the JSON cannot be encoded or an output file cannot be written
     */
    public static function exportToConllAndJson($file_path_without_ext, $ccl, $tokens, $relations, $annotations, $tokens_ids, $annotations_by_id)
    {
        $annotations_token_id = self::indexAnnotationsByToken($tokens, $annotations);
        $relations_cache = self::indexRelationsByAnnotation($relations);

        $conll = "ORDER_ID\tTOKEN_ID\tORTH\tCTAG\tFROM\tTO\tANN_TAGS\tANN_IDS\tREL_IDS\tREL_TARGET_ANN_IDS\n";
        $json_builder = [
            "chunks" => [],
            "relations" => array_values($relations),
            "annotations" => $annotations,
        ];

        // Position of the current token in $tokens_ids; runs across all chunks and sentences.
        $it = 0;
        foreach ($ccl->chunks as $chunk) {
            $json_sentences = [];
            foreach ($chunk->sentences as $sentence) {
                $json_sentence = [];
                $token_id = 0;
                foreach ($sentence->tokens as $token) {
                    $original_id = isset($tokens_ids[$it]) ? $tokens_ids[$it] : null;
                    $it++;

                    $ann_tag = [];
                    $ann_id = [];
                    $rel_id = [];
                    $rel_target_id = [];
                    if ($original_id !== null && isset($annotations_token_id[$original_id])) {
                        foreach ($annotations_token_id[$original_id] as $entry) {
                            $annotation = $annotations_by_id[$entry["ann_id"]];
                            $ann_tag[] = $entry["iob"] . $annotation['name'];
                            $ann_id[] = $annotation['id'];

                            if (!isset($relations_cache[$annotation['id']])) {
                                continue;
                            }
                            foreach ($relations_cache[$annotation['id']] as $relation) {
                                $rel_id[] = $relation['id'];
                                // Always emit one entry per relation so this column stays aligned with REL_IDS.
                                $rel_target_id[] = ($relation['source_id'] == $annotation['id'])
                                    ? $relation['target_id']
                                    : "_";
                            }
                        }
                    }

                    // A token without lexemes yields a null ctag instead of dereferencing a missing lexeme.
                    $ctag = isset($token->lexemes[0]) ? $token->lexemes[0]->ctag : null;

                    // The JSON keeps the raw (possibly empty) id lists; placeholders are CoNLL-only.
                    $json_sentence[] = [
                        "order_id" => $token->id,
                        "token_id" => $token_id,
                        "orth" => $token->orth,
                        "ctag" => $ctag,
                        "from" => $token->from,
                        "to" => $token->to,
                        "annotations" => $ann_id,
                        "relations" => $rel_id
                    ];

                    $columns = [
                        $token->id,
                        $token_id,
                        $token->orth,
                        $ctag,
                        $token->from,
                        $token->to,
                        self::joinOrPlaceholder($ann_tag, "O"),
                        self::joinOrPlaceholder($ann_id, "_"),
                        self::joinOrPlaceholder($rel_id, "_"),
                        self::joinOrPlaceholder($rel_target_id, "_"),
                    ];
                    $conll .= join("\t", $columns) . "\n";
                    $token_id++;
                }
                // Sentences are separated by an empty line.
                $conll .= "\n";
                $json_sentences[] = $json_sentence;
            }
            $json_builder["chunks"][] = $json_sentences;
        }

        // Encode before touching the disk so an encoding failure does not leave a lone .conll file behind.
        $json = json_encode($json_builder, JSON_PRETTY_PRINT | JSON_UNESCAPED_UNICODE);
        if ($json === false) {
            throw new Exception("Cannot encode the document as JSON (json error code " . json_last_error() . ")");
        }

        self::writeFile($file_path_without_ext . ".conll", $conll);
        self::writeFile($file_path_without_ext . ".json", $json);
    }

    /**
     * Builds the index TOKEN_ID => list of ["ann_id" => ..., "iob" => "B-"|"I-"].
     *
     * A token belongs to an annotation when its [from, to] span lies within the annotation span.
     * The first matching token (in the order of $tokens) gets "B-", the following ones get "I-".
     */
    private static function indexAnnotationsByToken($tokens, $annotations)
    {
        // Tokens keyed by start offset; when two tokens share an offset the first one is kept.
        $tokens_by_from = [];
        foreach ($tokens as $token) {
            if (!array_key_exists($token['from'], $tokens_by_from)) {
                $tokens_by_from[$token['from']] = [
                    "to" => $token['to'],
                    "id" => $token['token_id']
                ];
            }
        }

        $index = [];
        foreach ($annotations as $annotation) {
            $is_first = true;
            foreach ($tokens_by_from as $from => $span) {
                if ($from < $annotation['from'] || $span['to'] > $annotation['to']) {
                    continue;
                }
                $index[$span['id']][] = [
                    "ann_id" => $annotation['id'],
                    "iob" => $is_first ? "B-" : "I-"
                ];
                $is_first = false;
            }
        }
        return $index;
    }

    /**
     * Builds the index ANNOTATION_ID => list of relations in which the annotation is source or target.
     */
    private static function indexRelationsByAnnotation($relations)
    {
        $index = [];
        foreach ($relations as $relation) {
            // array_unique keeps a self-relation (source == target) from being listed twice.
            foreach (array_unique([$relation['source_id'], $relation['target_id']]) as $annotation_id) {
                $index[$annotation_id][] = $relation;
            }
        }
        return $index;
    }

    /**
     * Joins the values with ":" or returns the placeholder when there are no values.
     */
    private static function joinOrPlaceholder($values, $placeholder)
    {
        return empty($values) ? $placeholder : join(":", $values);
    }

    /**
     * Writes the content to the file and fails loudly instead of silently producing no output.
     *
     * @throws Exception when the file cannot be written
     */
    private static function writeFile($path, $content)
    {
        if (file_put_contents($path, $content) === false) {
            throw new Exception("Cannot write the export file " . $path);
        }
    }

}
29 changes: 17 additions & 12 deletions engine/include/export/CorpusExporter.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@

/**
* Klasa służy do eksportu wybranych dokumentów i elementów korpusu do wybranego formatu.
*
*
* Obecnie obsługiwany jest format CCL.
*
*
* @author czuk
*
*/
Expand Down Expand Up @@ -46,24 +46,24 @@ function parse_extractor($description){
}
$flag = $parts[0];
$elements = $parts[1];

$flag = split("=", $flag);
if ( count($flag) !== 2 ){
throw new Exception("Niepoprawny opis ekstraktora " . $description .": definicja flagi");
}

$flag_name = strtolower($flag[0]);
$flag_ids = explode(",", $flag[1]);

foreach ( explode("&", $elements) as $element ){
$parts = explode("=", $element);
$element_name = $parts[0];
$extractor_name = $flag_name."=".implode(",", $flag_ids).":".$element;
$extractor = array("flag_name"=>$flag_name, "flag_ids"=>$flag_ids, "name"=>$extractor_name);

/* Esktraktor anotacji po identyfikatorze zbioru anotacji */
if ( $element_name === "annotation_set_id" ){

$extractor["params"] = explode(",", $parts[1]);
$extractor["extractor"] = function($report_id, $params, &$elements){
// $params -- set of annotation_set_id
Expand Down Expand Up @@ -382,7 +382,7 @@ function getReportTagsByTokens($report_id, $tokens_ids, $disamb_only=true, $tagg
function export_document($report_id, &$extractors, $disamb_only, &$extractor_stats, &$lists, $output_folder, $subcorpora, $tagging_method){
$flags = DbReportFlag::getReportFlags($report_id);
$elements = array("annotations"=>array(), "relations"=>array(), "lemmas"=>array(), "attributes"=>array());

// Wykonaj esktraktor w zależności od ustalonej flagi
foreach ( $extractors as $extractor ){
$func = $extractor["extractor"];
Expand Down Expand Up @@ -449,7 +449,7 @@ function export_document($report_id, &$extractors, $disamb_only, &$extractor_sta
if ( isset($elements["attributes"]) && count($elements["attributes"]) ){
$attributes = $elements["attributes"];
}

/* Usunięcie zduplikowanych anotacji */
$annotations_by_id = array();
foreach ($annotations as $an){
Expand Down Expand Up @@ -501,9 +501,14 @@ function export_document($report_id, &$extractors, $disamb_only, &$extractor_sta
}
}

/* Wygeneruj xml i rel.xml */
CclFactory::setAnnotationsAndRelations($ccl, $annotations, $relations);
CclFactory::setAnnotationLemmas($ccl, $lemmas);
$file_path_without_ext = $output_folder . "/" . $ccl->getFileName();

/* Wygeneruj CONLL i JSON */
ConllAndJsonFactory::exportToConllAndJson($file_path_without_ext, $ccl, $tokens, $relations, $annotations, $tokens_ids, $annotations_by_id);

/* Wygeneruj xml i rel.xml */
CclFactory::setAnnotationsAndRelations($ccl, $annotations, $relations);
CclFactory::setAnnotationLemmas($ccl, $lemmas);
CclFactory::setAnnotationProperties($ccl, $attributes);
CclWriter::write($ccl, $output_folder . "/" . $ccl->getFileName() . ".xml", CclWriter::$CCL);
CclWriter::write($ccl, $output_folder . "/" . $ccl->getFileName() . ".rel.xml", CclWriter::$REL);
Expand Down
17 changes: 9 additions & 8 deletions local/daemon-export.php
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
include($enginePath . "/include.php");
include($enginePath . "/cliopt.php");
include($enginePath . "/clioptcommon.php");
ini_set('memory_limit', '-1');

mb_internal_encoding("utf-8");
ob_end_clean();
Expand Down Expand Up @@ -49,7 +50,7 @@
} else
throw new Exception(
"DB URI is incorrect. Given '$uri', but exptected" .
" 'user:pass@host:port/name'");
" 'user:pass@host:port/name'");
$config->dsn['phptype'] = 'mysql';
$config->dsn['username'] = $dbUser;
$config->dsn['password'] = $dbPass;
Expand Down Expand Up @@ -81,10 +82,10 @@ class TaskExport{
function __construct($config){
$this->db = new Database($config->dsn, false);
$GLOBALS['db'] = $this->db;

$this->verbose = $config->verbose;
$this->path_exports = $config->path_exports;

if ( !file_exists($this->path_exports) ){
mkdir($this->path_exports, 0777, true);
}
Expand Down Expand Up @@ -121,13 +122,13 @@ function tick(){
array("export_id"=>$task['export_id']));
}
$this->db->mdb2->query("COMMIT");

$selectors = array_filter(explode("\n",trim($task['selectors'])));
$extractors = array_filter(explode("\n",trim($task['extractors'])));
$indices = array_filter(explode("\n",trim($task['indices'])));

$result = $this->process($task['export_id'], $task['corpus_id'], $selectors, $extractors, $indices, $task['tagging']);

$message = "Eksport zakończony";
$status = "done";

Expand All @@ -146,16 +147,16 @@ function tick(){
* @param $tagging String tagging method from ['tagger', 'final', 'final_or_tagger', 'user:{id}']
*/
function process($task_id, $corpus_id, $selectors, $extractors, $indices, $tagging){
    // The export is generated into a per-task working folder and then packed next to it.
    // NOTE(review): $corpus_id is not used here — confirm whether the exporter derives it from the selectors.
    $output_folder = "/tmp/inforex_export_{$task_id}";
    $archive_path = "{$output_folder}.7z";

    $exporter = new CorpusExporter();
    $exporter->exportToCcl($output_folder, $selectors, $extractors, $indices, $task_id, $tagging);
    echo "packing...\n";

    // Every path is passed through escapeshellarg so that spaces or shell metacharacters in the
    // task id or in the configured export path cannot break (or inject into) the command line.
    shell_exec("7z a " . escapeshellarg($archive_path) . " " . escapeshellarg($output_folder));
    shell_exec("mv " . escapeshellarg($archive_path) . " " . escapeshellarg($this->path_exports));
    echo "finished.\n\n";

    // The exit status of the shell commands is not inspected; the method always reports success.
    return true;
}
}
Expand Down