Skip to content

Commit

Permalink
Scripts for generating a Hungarian charset table for Sphinxsearch.com
Browse files Browse the repository at this point in the history
  • Loading branch information
bieli committed Aug 13, 2013
1 parent d7fbc9a commit feb0b06
Show file tree
Hide file tree
Showing 2 changed files with 35 additions and 0 deletions.
2 changes: 2 additions & 0 deletions scripts/sphinx-charset-generator.hungarian.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
charset_table=a..z,0..9,A..Z->a..z, \
U+C0->a, U+C1->a, U+C2->a, U+C3->a, U+C7->c, U+C8->e, U+C9->e, U+CA->e, U+CC->i, U+CD->i, U+CE->i, U+D2->o, U+D3->o, U+D4->o, U+D5->o, U+D9->u, U+DA->u, U+DB->u, U+E0->a, U+E1->a, U+E2->a, U+E3->a, U+E7->c, U+E8->e, U+E9->e, U+EA->e, U+EC->i, U+ED->i, U+EE->i, U+F2->o, U+F3->o, U+F4->o, U+F5->o, U+F9->u, U+FA->u, U+FB->u, U+102->a, U+103->a, U+15E->s, U+15F->s, U+162->t, U+163->t
33 changes: 33 additions & 0 deletions scripts/sphinx-charset-generator.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#!/usr/bin/env perl
#
# Generates a charset_table entry for Sphinx
#

use strict;
use warnings;
use utf8;
use Text::Unaccent;
use Encode;

# The target charset is taken from the first command-line argument and
# falls back to UTF-8 when none is given.
my $target_charset = $ARGV[0] || 'utf-8';
my $map            = gen_map_accents($target_charset);

# First line of the directive: the plain ASCII ranges, ending in a
# backslash so the accent mappings continue on the next line.
print "charset_table=a..z,0..9,A..Z->a..z, \\\n";

# One "U+<hex code>->replacement" entry per mapped character, in sorted
# key order.
my @entries;
for my $char (sort keys %{$map}) {
    push @entries, sprintf('U+%X->%s', unpack('W*', $char), $map->{$char});
}

print join(', ', @entries);
print "\n";

# Build the accent-folding map used to emit the Sphinx charset_table.
#
# Parameters:
#   $charset - name of the target charset (e.g. 'utf-8', 'iso-8859-2').
#
# Returns a hash reference whose keys are single accented characters (as
# represented in $charset) and whose values are the lowercased results of
# Text::Unaccent's unac_string() for those characters.
#
# For a non-UTF-8 charset the sample characters are transcoded with Encode;
# characters that cannot be represented in that charset are left out of the
# map instead of being turned into a substitution character.
#
# Dies if Encode does not know $charset, if unac_string() returns an
# undefined value, or if the unaccented text does not line up one-to-one
# with the input characters.
sub gen_map_accents {
    my ($charset) = @_;
    my %map;

    # my $src = 'áéíóúàèìòùãõâêîôûç';
    my $src = 'áéíóúàèìòùãõâêîôûçĂÂÎŢŞăâîţş';
    $src .= uc($src);

    unless ($charset =~ m/^utf-?8$/i) {
        # Validate the name up front so the per-character eval below only
        # ever hides "character not representable" failures.
        my $encoding = Encode::find_encoding($charset)
            or die "Unknown encoding '$charset'";

        # With the default CHECK, encode() silently swaps unmappable
        # characters for a substitution character, which would then show
        # up in the table as a bogus entry mapping that character to
        # itself. Encode one character at a time and skip the failures.
        my $encoded = '';
        for my $char (split //, $src) {
            my $copy  = $char;    # encode() with CHECK may modify its input
            my $bytes = eval { $encoding->encode($copy, Encode::FB_CROAK()) };
            $encoded .= $bytes if defined $bytes;
        }
        $src = $encoded;
    }

    # Nothing representable in this charset: return an empty map.
    return \%map if $src eq '';

    my $unaccented = unac_string($charset, $src);
    die "unac_string() returned no result for charset '$charset'"
        unless defined $unaccented;
    my $clean = lc($unaccented);

    my @accented = split //, $src;
    my @folded   = split //, $clean;

    # The slice assignment below pairs characters by position, so a length
    # difference would silently shift every following mapping.
    die sprintf(
        "Unaccented text has %d characters but the input has %d; "
            . "cannot build a one-to-one map for charset '%s'",
        scalar @folded, scalar @accented, $charset
    ) unless @accented == @folded;

    @map{@accented} = @folded;

    return \%map;
}


0 comments on commit feb0b06

Please sign in to comment.