From fc3a0ee295c383fabad7d2da34ffc6b37a344896 Mon Sep 17 00:00:00 2001 From: Marc Kupietz Date: Fri, 5 Jul 2024 16:58:16 +0200 Subject: [PATCH] Add --auto-textsigle option Also allows for processing plain TEI P5 files without any IDs. Change-Id: Ic16b089c916d2e50458aa1aa6cb80ce4d37d97ba --- Changes | 1 + Readme.pod | 11 +++++++++++ lib/KorAP/XML/TEI.pm | 14 +++++++++++++- script/tei2korapxml | 20 ++++++++++++++++++-- 4 files changed, 43 insertions(+), 3 deletions(-) diff --git a/Changes b/Changes index 191d067..a487f61 100644 --- a/Changes +++ b/Changes @@ -1,6 +1,7 @@ 2.6.0 2024-09-19 - Add -o parameter. - Add support for inline dependency relations. + - Add support for --auto-textsigle. 2.5.0 2024-01-24 - Upgrade minimal Perl version to 5.36 to improve diff --git a/Readme.pod b/Readme.pod index 1c95540..e890733 100644 --- a/Readme.pod +++ b/Readme.pod @@ -165,6 +165,17 @@ C<--no-skip-inline-token-annotations>. Expects a comma-separated list of tags to be ignored when the structure is parsed. Content of these tags however will be processed. +=item B<--auto-textsigle> + +Expects a text sigle that serves as fallback if no text sigles +are given in the input data. +The auto text sigle will be incremented for each text processed. 
+ +Example: + + tei2korapxml --auto-textsigle 'ICC/GER.00001' -s -tk - \ + < data.i5.xml > korapxml.zip + =item B<--xmlid-to-textsigle> @ Expects a regular replacement expression (separated by B<@> between the diff --git a/lib/KorAP/XML/TEI.pm b/lib/KorAP/XML/TEI.pm index 1111c8b..f7768e7 100644 --- a/lib/KorAP/XML/TEI.pm +++ b/lib/KorAP/XML/TEI.pm @@ -4,7 +4,7 @@ use strict; use warnings; use Exporter 'import'; -our @EXPORT_OK = qw(remove_xml_comments escape_xml escape_xml_minimal replace_entities); +our @EXPORT_OK = qw(remove_xml_comments escape_xml escape_xml_minimal replace_entities increase_auto_textsigle); # convert '&', '<' and '>' into their corresponding sgml-entities my %ent_without_quot = ( @@ -180,4 +180,16 @@ sub replace_entities { return($_); }; +sub increase_auto_textsigle { + my $sigle = shift; + + if ($sigle =~ /(\d+)$/) { + my $number = $1; + my $length = length($number); + $number++; + my $new_number = sprintf("%0${length}d", $number); + $sigle =~ s/\d+$/$new_number/; + } + return $sigle; +} 1; diff --git a/script/tei2korapxml b/script/tei2korapxml index 418408e..86f7527 100755 --- a/script/tei2korapxml +++ b/script/tei2korapxml @@ -6,6 +6,7 @@ use Log::Any '$log'; use Log::Any::Adapter; use Pod::Usage; use Getopt::Long qw(GetOptions :config no_auto_abbrev); +use KorAP::XML::TEI qw(increase_auto_textsigle); use File::Basename qw(dirname); @@ -45,6 +46,7 @@ my $inline_deps_exclusive = 0; # Parse options from the command line GetOptions( + 'auto-textsigle|A=s' => \(my $auto_textsigle = ''), 'root|r=s' => \(my $root_dir = '.'), 'input|i=s' => \(my $input_fname = ''), 'output|o=s' => \(my $output_fname = ''), @@ -460,8 +462,11 @@ MAIN: while (<$input_fh>) { }; # Parse header - my $header = KorAP::XML::TEI::Header->new($content, $input_enc, $text_id_esc)->parse($input_fh); - + my $header = KorAP::XML::TEI::Header->new($content, $input_enc, $text_id_esc // $auto_textsigle)->parse($input_fh); + if ($auto_textsigle) { + $auto_textsigle = 
increase_auto_textsigle($auto_textsigle); + $log->debug("Auto-incremented text sigle to $auto_textsigle"); + }; # Header was parseable if ($header) { @@ -666,6 +671,17 @@ C<--no-skip-inline-token-annotations>. Expects a comma-separated list of tags to be ignored when the structure is parsed. Content of these tags however will be processed. +=item B<--auto-textsigle> + +Expects a text sigle that serves as fallback if no text sigles +are given in the input data. +The auto text sigle will be incremented for each text processed. + +Example: + + tei2korapxml --auto-textsigle 'ICC/GER.00001' -s -tk - \ + < data.i5.xml > korapxml.zip + =item B<--xmlid-to-textsigle> @ Expects a regular replacement expression (separated by B<@> between the