#!/usr/bin/env perl
#
# Generates a charset_table entry for Sphinx
#
# Usage: sphinx-charset-generator.pl [CHARSET]
#
# CHARSET is the encoding name handed to Encode and Text::Unaccent; it
# defaults to 'utf-8'.  The entry is printed on standard output.
#
# Provenance (taken from the patch that introduced this script):
#   Commit:  feb0b06ddca8a0316b557d217f0451755294dbe7
#   From:    Marcin Bielak
#   Date:    Tue, 13 Aug 2013 19:56:58 +0200
#   Subject: scripts for generating hungarian charset table for
#            Sphinxsearch.com
# The patch created scripts/sphinx-charset-generator.pl (this file) and
# scripts/sphinx-charset-generator.hungarian.txt (reproduced after __END__).
#

use strict;
use warnings;
use utf8;
use Text::Unaccent;
use Encode;

my $map = gen_map_accents($ARGV[0] || 'utf-8');

# First line of the entry: plain ASCII letters and digits, upper case folded
# to lower case.  The trailing backslash continues the directive.
print "charset_table=a..z,0..9,A..Z->a..z, \\\n";

# One "U+<hex code>->replacement" item per accented character.  Every key of
# the map is exactly one character long, so ord() yields its code.
print join(', ', map { sprintf('U+%X->%s', ord($_), $map->{$_}) } sort keys %$map);
print "\n";

# gen_map_accents($charset)
#
# Builds the accent-folding map for a fixed list of accented letters (both
# cases).
#
# Parameters:
#   $charset - encoding name, e.g. 'utf-8' or 'iso-8859-2'.
#
# Returns:
#   A hash reference mapping each accented character to its lower-cased,
#   unaccented single-character ASCII replacement.  For UTF-8 the keys are
#   the characters themselves; for any other charset the keys are the
#   single bytes the characters encode to.
#
# Dies if Encode does not know $charset.
#
# Characters are skipped (instead of producing a wrong entry) when they
#   * cannot be represented in $charset,
#   * do not encode to exactly one byte in a non-UTF-8 charset, or
#   * do not unaccent to exactly one ASCII character.
sub gen_map_accents {
    my $charset = shift;
    my %map;

    my $encoding = find_encoding($charset)
        or die "Unknown encoding '$charset'\n";
    my $is_utf8 = $charset =~ m/^utf-?8$/i;

    # Shorter alternative list, left disabled:
#   my $src = 'áéíóúàèìòùãõâêîôûç';
    my $src = 'áéíóúàèìòùãõâêîôûçĂÂÎŢŞăâîţş';
    $src .= uc($src);

    # Each character is handled on its own so that one unmappable or
    # expanding character can never shift the pairing of the others.
    for my $char (split(//, $src)) {
        # encode() with a CHECK argument may modify its input: use a copy.
        # FB_CROAK makes unrepresentable characters fail instead of being
        # silently replaced by '?'.
        my $copy  = $char;
        my $bytes = eval { $encoding->encode($copy, Encode::FB_CROAK) };
        next unless defined $bytes && length $bytes;

        my $key = $is_utf8 ? $char : $bytes;
        next unless length($key) == 1;

        # unac_string() works on bytes in the named charset.
        my $plain = unac_string($charset, $bytes);
        next unless defined $plain;

        $plain = lc($plain);
        next unless length($plain) == 1 && ord($plain) < 0x80;

        $map{$key} = $plain;
    }

    return \%map;
}

__END__

Contents of scripts/sphinx-charset-generator.hungarian.txt, added by the
same patch as this script:

charset_table=a..z,0..9,A..Z->a..z, \
U+C0->a, U+C1->a, U+C2->a, U+C3->a, U+C7->c, U+C8->e, U+C9->e, U+CA->e, U+CC->i, U+CD->i, U+CE->i, U+D2->o, U+D3->o, U+D4->o, U+D5->o, U+D9->u, U+DA->u, U+DB->u, U+E0->a, U+E1->a, U+E2->a, U+E3->a, U+E7->c, U+E8->e, U+E9->e, U+EA->e, U+EC->i, U+ED->i, U+EE->i, U+F2->o, U+F3->o, U+F4->o, U+F5->o, U+F9->u, U+FA->u, U+FB->u, U+102->a, U+103->a, U+15E->s, U+15F->s, U+162->t, U+163->t