Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: new [taxonomy].extended.json with extended synonyms #10744

Merged
merged 2 commits into from
Aug 28, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 32 additions & 21 deletions lib/ProductOpener/Tags.pm
Original file line number Diff line number Diff line change
Expand Up @@ -1098,8 +1098,7 @@ sub get_file_from_cache ($source, $target) {
# e.g. if the taxonomy building algorithm or configuration has changed
# This needs to be done also when the unaccenting parameters for languages set in Config.pm are changed

my $BUILD_TAGS_VERSION
= "20240403 - fix issue with additives.properties.txt not loaded + circular_parent check + moved canonicalization of properties to linter";
my $BUILD_TAGS_VERSION = "20240828 - new [tagtype].extended.json format with normalized extended synonyms";

sub get_from_cache ($tagtype, @files) {
# If the full set of cached files can't be found then returns the hash to be used
Expand Down Expand Up @@ -2092,6 +2091,7 @@ sub build_tags_taxonomy ($tagtype, $publish) {
# data structure to export the taxonomy to json format
my %taxonomy_json = ();
my %taxonomy_full_json = (); # including wikipedia abstracts
my %taxonomy_extended_json = (); # with extended synonyms

foreach my $lc (sort keys %{$stopwords{$tagtype}}) {
next if $lc =~ /\./; # .orig or .strings
Expand All @@ -2107,6 +2107,7 @@ sub build_tags_taxonomy ($tagtype, $publish) {

$taxonomy_json{$tagid} = {name => {}};
$taxonomy_full_json{$tagid} = {name => {}};
$taxonomy_extended_json{$tagid} = {name => {}};

# print "taxonomy - compute all children - $tagid - level: $level{$tagtype}{$tagid} - longest: $longest_parent{$tagid} - syn: $just_synonyms{$tagtype}{$tagid} - sort_key: $sort_key_parents{$tagid} \n";
if (defined $direct_parents{$tagtype}{$tagid}) {
Expand Down Expand Up @@ -2160,6 +2161,7 @@ sub build_tags_taxonomy ($tagtype, $publish) {

$taxonomy_json{$tagid}{name}{$lc} = $translations_to{$tagtype}{$tagid}{$lc};
$taxonomy_full_json{$tagid}{name}{$lc} = $translations_to{$tagtype}{$tagid}{$lc};
$taxonomy_extended_json{$tagid}{name}{$lc} = $translations_to{$tagtype}{$tagid}{$lc};

my $lc_tagid = get_string_id_for_lang($lc, $translations_to{$tagtype}{$tagid}{$lc});

Expand All @@ -2175,12 +2177,24 @@ sub build_tags_taxonomy ($tagtype, $publish) {
{
$taxonomy_json{$tagid}{name}{$lc} .= " - " . $synonyms_for{$tagtype}{$lc}{$lc_tagid}[1];
$taxonomy_full_json{$tagid}{name}{$lc} .= " - " . $synonyms_for{$tagtype}{$lc}{$lc_tagid}[1];
$taxonomy_extended_json{$tagid}{name}{$lc}
.= " - " . $synonyms_for{$tagtype}{$lc}{$lc_tagid}[1];
}

# add synonyms to the full taxonomy
if (defined $synonyms_for{$tagtype}{$lc}{$lc_tagid}) {
(defined $taxonomy_full_json{$tagid}{synonyms}) or $taxonomy_full_json{$tagid}{synonyms} = {};
$taxonomy_full_json{$tagid}{synonyms}{$lc} = $synonyms_for{$tagtype}{$lc}{$lc_tagid};
$taxonomy_extended_json{$tagid}{normalized_synonyms}{$lc}
= [map {get_string_id_for_lang($lc, $_)} @{$synonyms_for{$tagtype}{$lc}{$lc_tagid}}];
}

# add extended synonyms to the extended taxonomy
if (defined $synonyms_for_extended{$tagtype}{$lc}{$lc_tagid}) {
(defined $taxonomy_extended_json{$tagid}{synonyms_extended})
or $taxonomy_extended_json{$tagid}{synonyms_extended} = {};
$taxonomy_extended_json{$tagid}{normalized_synonyms_extended}{$lc}
= [sort keys %{$synonyms_for_extended{$tagtype}{$lc}{$lc_tagid}}];
}
}
}
Expand Down Expand Up @@ -2235,27 +2249,24 @@ sub build_tags_taxonomy ($tagtype, $publish) {

{
binmode STDOUT, ":encoding(UTF-8)";
if (open(my $OUT_JSON, ">", "$BASE_DIRS{PUBLIC_DATA}/taxonomies/$tagtype.json")) {
print $OUT_JSON encode_json(\%taxonomy_json);
close($OUT_JSON);
}
else {
print
"Cannot open $BASE_DIRS{PUBLIC_DATA}/taxonomies/$tagtype.json, skipping writing taxonomy to file.\n";
}

if (open(my $OUT_JSON_FULL, ">", "$BASE_DIRS{PUBLIC_DATA}/taxonomies/$tagtype.full.json")) {
print $OUT_JSON_FULL encode_json(\%taxonomy_full_json);
close($OUT_JSON_FULL);
}
else {
print
"Cannot open $BASE_DIRS{PUBLIC_DATA}/taxonomies/$tagtype.full.json, skipping writing taxonomy to file.\n";
foreach my $extension_taxonomy_ref (
["", \%taxonomy_json],
[".full", \%taxonomy_full_json],
[".extended", \%taxonomy_extended_json]
)
{
my ($extension, $taxonomy_ref) = @$extension_taxonomy_ref;
# Output the taxonomy in JSON format
if (open(my $OUT_JSON, ">", "$BASE_DIRS{PUBLIC_DATA}/taxonomies/$tagtype$extension.json")) {
print $OUT_JSON encode_json($taxonomy_ref);
close($OUT_JSON);
}
else {
print
"Cannot open $BASE_DIRS{PUBLIC_DATA}/taxonomies/$tagtype$extension.json, skipping writing taxonomy to file.\n";
}
}
# to serve pre-compressed files from Apache
# nginx : needs nginx_static module
# system("cp $BASE_DIRS{PUBLIC_DATA}/taxonomies/$tagtype.json $BASE_DIRS{PUBLIC_DATA}/taxonomies/$tagtype.json.json");
# system("gzip $BASE_DIRS{PUBLIC_DATA}/taxonomies/$tagtype.json");
}

$log->error("taxonomy errors", {errors => \@taxonomy_errors}) if $log->is_error();
Expand Down
Loading