Skip to content
This repository was archived by the owner on Sep 10, 2022. It is now read-only.

Commit 0d2fe13

Browse files
author
Richard Sproat
committed
Update to Sparrowhawk 1.0, with new serialization.
1 parent eb97411 commit 0d2fe13

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

58 files changed

+3368
-76
lines changed

NEWS

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,3 +2,8 @@ Sparrowhawk - Release 0.1
22

33
This is the alpha version.
44

5+
Sparrowhawk - Release 1.0
6+
7+
* Added new verbalizer serialization, with accompanying grammars.
8+
9+

README

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
Sparrowhawk - Release 0.1
1+
Sparrowhawk - Release 1.0
22

33
Sparrowhawk is an open-source implementation of Google's Kestrel text-to-speech
44
text normalization system. It follows the discussion of the Kestrel system as
@@ -34,6 +34,11 @@ INSTALLATION:
3434
recommend configuring with --enable-static=no for faster
3535
compiles.
3636

37+
NOTE: In some versions of Mac OS X we have noticed a problem with configure
38+
whereby it fails to find fst.h. If this occurs, try configuring as follows:
39+
40+
CPPFLAGS=-I/usr/local/include LDFLAGS=-L/usr/local/lib ./configure
41+
3742
USAGE:
3843
Assuming you've installed under the default /usr/local, the library will be
3944
in /usr/local/lib, and the headers in /usr/local/include/sparrowhawk.

configure

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#! /bin/sh
22
# Guess values for system-dependent variables and create Makefiles.
3-
# Generated by GNU Autoconf 2.69 for Sparrowhawk 0.1.0.
3+
# Generated by GNU Autoconf 2.69 for Sparrowhawk 1.0.0.
44
#
55
# Report bugs to <rws@google.com>.
66
#
@@ -590,8 +590,8 @@ MAKEFLAGS=
590590
# Identity of this package.
591591
PACKAGE_NAME='Sparrowhawk'
592592
PACKAGE_TARNAME='sparrowhawk'
593-
PACKAGE_VERSION='0.1.0'
594-
PACKAGE_STRING='Sparrowhawk 0.1.0'
593+
PACKAGE_VERSION='1.0.0'
594+
PACKAGE_STRING='Sparrowhawk 1.0.0'
595595
PACKAGE_BUGREPORT='rws@google.com'
596596
PACKAGE_URL=''
597597

@@ -1325,7 +1325,7 @@ if test "$ac_init_help" = "long"; then
13251325
# Omit some internal or obsolete options to make the list less imposing.
13261326
# This message is too long to be a string in the A/UX 3.1 sh.
13271327
cat <<_ACEOF
1328-
\`configure' configures Sparrowhawk 0.1.0 to adapt to many kinds of systems.
1328+
\`configure' configures Sparrowhawk 1.0.0 to adapt to many kinds of systems.
13291329

13301330
Usage: $0 [OPTION]... [VAR=VALUE]...
13311331

@@ -1395,7 +1395,7 @@ fi
13951395

13961396
if test -n "$ac_init_help"; then
13971397
case $ac_init_help in
1398-
short | recursive ) echo "Configuration of Sparrowhawk 0.1.0:";;
1398+
short | recursive ) echo "Configuration of Sparrowhawk 1.0.0:";;
13991399
esac
14001400
cat <<\_ACEOF
14011401

@@ -1504,7 +1504,7 @@ fi
15041504
test -n "$ac_init_help" && exit $ac_status
15051505
if $ac_init_version; then
15061506
cat <<\_ACEOF
1507-
Sparrowhawk configure 0.1.0
1507+
Sparrowhawk configure 1.0.0
15081508
generated by GNU Autoconf 2.69
15091509

15101510
Copyright (C) 2012 Free Software Foundation, Inc.
@@ -1994,7 +1994,7 @@ cat >config.log <<_ACEOF
19941994
This file contains any messages produced by compilers while
19951995
running configure, to aid debugging if configure makes a mistake.
19961996

1997-
It was created by Sparrowhawk $as_me 0.1.0, which was
1997+
It was created by Sparrowhawk $as_me 1.0.0, which was
19981998
generated by GNU Autoconf 2.69. Invocation command line was
19991999

20002000
$ $0 $@
@@ -2857,7 +2857,7 @@ fi
28572857

28582858
# Define the identity of the package.
28592859
PACKAGE='sparrowhawk'
2860-
VERSION='0.1.0'
2860+
VERSION='1.0.0'
28612861

28622862

28632863
cat >>confdefs.h <<_ACEOF
@@ -4162,6 +4162,7 @@ unknown)
41624162
esac
41634163

41644164

4165+
CPPFLAGS="$CPPFLAGS -funsigned-char"
41654166
CXXFLAGS="$CXXFLAGS -std=c++11"
41664167

41674168
ac_ext=cpp
@@ -16052,7 +16053,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
1605216053
# report actual input values of CONFIG_FILES etc. instead of their
1605316054
# values after options handling.
1605416055
ac_log="
16055-
This file was extended by Sparrowhawk $as_me 0.1.0, which was
16056+
This file was extended by Sparrowhawk $as_me 1.0.0, which was
1605616057
generated by GNU Autoconf 2.69. Invocation command line was
1605716058

1605816059
CONFIG_FILES = $CONFIG_FILES
@@ -16109,7 +16110,7 @@ _ACEOF
1610916110
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
1611016111
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
1611116112
ac_cs_version="\\
16112-
Sparrowhawk config.status 0.1.0
16113+
Sparrowhawk config.status 1.0.0
1611316114
configured by $0, generated by GNU Autoconf 2.69,
1611416115
with options \\"\$ac_cs_config\\"
1611516116

configure.ac

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
1-
AC_INIT([Sparrowhawk], [0.1.0], [rws@google.com])
1+
AC_INIT([Sparrowhawk], [1.0.0], [rws@google.com])
22
AM_INIT_AUTOMAKE([foreign nostdinc -Wall -Werror])
33

44
AM_PROG_AR
55

6+
CPPFLAGS="$CPPFLAGS -funsigned-char"
67
CXXFLAGS="$CXXFLAGS -std=c++11"
78

89
AC_PROG_CXX

documentation/README.md

Lines changed: 78 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -357,6 +357,77 @@ the token as a sequence of characters:
357357
3_character :_character 3_character 0_character
358358
</pre>
359359

360+
### Verbalizer grammars: new serialization format (Sparrowhawk 1.0 and above)
361+
362+
With Sparrowhawk 1.0, we introduce a simpler format for verbalizer
363+
grammars. The upside of this is that it makes writing the verbalizer grammars
364+
quite a bit simpler. The downside is that it requires a serialization
365+
specification proto instance (see below). This new format has no relevance to
366+
the classifier grammars, which should be written as described above in any case.
367+
368+
The main differences between the previous format and the new
369+
serialization format are first that the representation that is passed by the
370+
serialization to the verbalizer is more compact. Instead of
371+
372+
<pre>
373+
money { amount { integer_part: "3" } currency: "usd" }
374+
</pre>
375+
376+
what gets passed is
377+
378+
<pre>
379+
money|integer_part:3|currency:usd|
380+
</pre>
381+
382+
For both major and minor currencies the verbalizer sees, e.g.:
383+
384+
<pre>
385+
money|integer_part:3|currency:usd|fractional_part:50|currency:usd|
386+
</pre>
387+
388+
The second major difference is that a REDUP rule is no longer needed. Rather the
389+
serialization, and possible copying of elements, is done in code, controlled by
390+
the serialization specification, itself an ASCII protocol buffer representation
391+
that is referenced by an additional optional specification in the Sparrowhawk
392+
configuration file. An example is given in
393+
"verbalizer&#x005f;serialization&#x005f;spec.ascii&#x005f;proto". This specifies
394+
the serialization possibilities for the different classes. For money, the
395+
specification:
396+
397+
<pre>
398+
class_spec {
399+
semiotic_class: "money"
400+
style_spec {
401+
record_spec {
402+
field_path: "money.amount.integer_part"
403+
suffix_spec {
404+
field_path: "money.currency"
405+
}
406+
}
407+
record_spec {
408+
field_path: "money.amount.fractional_part"
409+
suffix_spec {
410+
field_path: "money.currency"
411+
}
412+
}
413+
}
414+
}
415+
</pre>
416+
417+
means that the integer part of the money expression and the fractional part are
418+
verbalized in that order, and the repetition of the "money.currency" field has
419+
the effect of duplicating the expression for the currency itself. Again, the
420+
verbalizer grammar is responsible for determining that the first instance would
421+
be read as the major currency expression, and the second as the minor currency
422+
expression.
423+
424+
The protocol buffer definition of the serialization specification is found in
425+
"src/proto/serialization&#x005f;spec.proto", which is also documented with
426+
comments on the functions of the various fields.
427+
428+
The parallel English toy grammar in the new serializer format can be found in
429+
"grammars/en&#x005f;toy/verbalize&#x005f;serialization".
430+
360431
### Sentence boundary detection
361432

362433
Sparrowhawk provides some simple support for sentence boundary detection. One
@@ -454,6 +525,12 @@ For example in the "grammars" directory, assuming one has built all the grammars
454525
normalizer_main --config=sparrowhawk_configuration.ascii_proto --multi_line_text < test.txt 2>/dev/null
455526
</pre>
456527

528+
For the new serialization specification, the invocation is as follows:
529+
530+
<pre>
531+
normalizer_main --config=sparrowhawk_configuration_serialization.ascii_proto --multi_line_text < test.txt 2>/dev/null
532+
</pre>
533+
457534
Integrating Sparrowhawk with Festival
458535
-------------------------
459536

@@ -509,7 +586,7 @@ festival/examples/sparrowhawk_test_us_null.scm
509586
Sparrowhawk will perform tokenization and text normalization and leave you with
510587
a sequence of words in Festival's 'Word' relation. You need to take it from
511588
there.
512-
589+
513590
How to cite Sparrowhawk
514591
-------------------------
515592

13.8 KB
Binary file not shown.
306 Bytes
Binary file not shown.
Binary file not shown.
Binary file not shown.
Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
import '../byte.grm' as b;
2+
import '../util.grm' as u;
3+
import 'numbers.grm' as n;
4+
5+
# quotation mark
6+
q = u.q;
7+
8+
# Used to allow for different numbers of spaces coming out of the serializer.
9+
s = u.s;
10+
11+
month = b.kAlpha+;
12+
13+
day = n.ORDINAL;
14+
15+
d = b.kDigit;
16+
D = b.kDigit - "0";
17+
18+
two_digit =
19+
((D d) @ n.CARDINAL)
20+
| ("0" : "oh ") (D @ n.CARDINAL)
21+
| ("00" : "hundred")
22+
;
23+
24+
# Years are not read as cardinals, generally:
25+
year =
26+
(("19" @ n.CARDINAL) u.I[" "] two_digit)
27+
| (("20" @ n.CARDINAL) u.I[" "] ((D d) @ two_digit))
28+
| (("200" d) @ n.CARDINAL)
29+
;
30+
31+
# Optional preserve_order / field_order fields: remove these if they occur.
32+
33+
field = (b.kAlpha | "_")+;
34+
preserve_order = "preserve_order:true";
35+
field_order = "field_order:" field;
36+
field_order_specs = (preserve_order | field_order)*;
37+
38+
# Verbalization for MDY
39+
mdy =
40+
u.D["date"]
41+
u.D["|month:"]
42+
month
43+
u.I[" the "]
44+
u.D["|day:"]
45+
day
46+
u.I[" "]
47+
u.D["|year:"]
48+
year
49+
u.D[field_order_specs]?
50+
u.D["|"]
51+
;
52+
53+
# Verbalization for DMY
54+
dmy =
55+
u.D["date"]
56+
u.I["the "]
57+
u.D["|day:"]
58+
day
59+
u.I[" of "]
60+
u.D["|month:"]
61+
month
62+
u.D["|year:"]
63+
u.I[" "]
64+
year
65+
u.D[field_order_specs]?
66+
u.D["|"]
67+
;
68+
69+
export DATE = Optimize[mdy | dmy];

0 commit comments

Comments
 (0)