Skip to content

Commit

Permalink
add function to extract basic statistical info from ALTO xml (number …
Browse files Browse the repository at this point in the history
…of textlines, words, glyphs, illustrations and graphical elements)
  • Loading branch information
cneud committed Oct 13, 2023
1 parent 0e61377 commit f417d5d
Show file tree
Hide file tree
Showing 2 changed files with 44 additions and 1 deletion.
5 changes: 4 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,11 +33,14 @@ alto-tools <INPUT> [OPTION]

`INPUT` should be the path to an ALTO xml file or directory containing ALTO xml files.

Output is sent to `stdout`.
The following `OPTIONS` are currently supported:

| OPTION | Description |
|------------------------|:------------------------------------------------------------------|
| `-t` `--text` | Extract UTF-8 encoded text content |
| `-c` `--confidence` | Extract mean OCR word confidence score |
| `-i` `--illustrations` | Extract bounding box coordinates of `<Illustration>` elements |
| `-g` `--graphics` | Extract bounding box coordinates of `<GraphicalElement>` elements |
| `-s` `--statistics` | Extract statistical info (no. of text lines, words, glyphs, illustrations and graphical elements) |

All output is sent to `stdout`.
40 changes: 40 additions & 0 deletions src/alto_tools/alto_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,36 @@ def alto_confidence(alto, xml, xmlns):
return 0


def alto_statistics(alto, xml, xmlns):
    """Extract statistical information from ALTO xml file.

    Counts the <TextLine>, <String>, <Glyph>, <Illustration> and
    <GraphicalElement> elements found anywhere below ``xml`` in the
    namespace ``xmlns`` and writes a short report to stdout.

    Args:
        alto: Object whose ``name`` attribute is printed as the file name.
        xml: Parsed XML tree or element; only ``iterfind`` is used.
        xmlns: Namespace URI that qualifies the element names.

    Returns:
        The number of <TextLine> elements. Only this count is returned,
        which is what callers have always received; the remaining counts
        are available through the printed report.
    """
    # Report order is fixed by this tuple (dicts preserve insertion order).
    tags = ("TextLine", "String", "Glyph", "Illustration", "GraphicalElement")
    counts = {
        tag: sum(1 for _ in xml.iterfind(f".//{{{xmlns}}}{tag}"))
        for tag in tags
    }
    # Each report line starts with a newline and the report has no trailing
    # newline; this matches the established output format.
    report = [f"\nFile: {alto.name}, Statistics:"]
    report.extend(
        f"\n# of <{tag}> elements: {count}" for tag, count in counts.items()
    )
    sys.stdout.write("".join(report))
    return counts["TextLine"]


def parse_arguments():
parser = argparse.ArgumentParser(
description="ALTO Tools: simple tools for performing various operations on ALTO xml files",
Expand Down Expand Up @@ -197,6 +227,14 @@ def parse_arguments():
dest="graphics",
help="extract bounding boxes of graphical elements",
)
g.add_argument(
"-s",
"--statistics",
action="store_true",
default=False,
dest="statistics",
help="extract statistical information",
)
parser.add_argument(
"-x",
"--xml-encoding",
Expand Down Expand Up @@ -276,6 +314,8 @@ def main():
alto_illustrations(alto, xml, xmlns)
if args.graphics:
alto_graphics(alto, xml, xmlns)
if args.statistics:
alto_statistics(alto, xml, xmlns)
number_of_files = len(list(walker(args.INPUT, fnfilter)))
if number_of_files >= 2:
if args.confidence:
Expand Down

0 comments on commit f417d5d

Please sign in to comment.