Text is searched recursively in all tags embedded in span tags

c-mauderer · Jan 21, 2015 · 6cb02f6 · 6cb02f6
1 parent f6b9b6a
commit 6cb02f6
Show file tree

Hide file tree

Showing 2 changed files with 18 additions and 4 deletions.
diff --git a/HocrConverter.py b/HocrConverter.py
@@ -421,10 +421,23 @@ def to_pdf(self, imageFileNames, outFileName, fontname="Courier", fontsize=12, w
                 textContent = unicodedata.normalize("NFC",unicode(" ".join([elem for elem in map((lambda text: text.strip()),line.itertext()) if len(elem) > 0])))
               else:
                 textContent = line.text
-                if ( textContent == None ):
-                  textContent = line.findtext("%sstrong"%(self.xmlns))
-                if ( textContent == None ):
-                  textContent = line.findtext("%sem"%(self.xmlns))
+                if ( textContent == None):
+                # Text in a tag can be embedded in other tags. In that case
+                # we need to search recursively in all tags.
+                # We search recursively only in <span> tags which
+                # contain only non-<span> tags like <strong> or <em>.
+                  span_child = 0
+                  for child_tag in line.iter("%sspan"%(self.xmlns)):
+                    print(child_tag)
+                    span_child = span_child + 1
+                # The line element contains no other <span> tag.
+                # It can contain some text. We search recursively
+                # in all tags contained in this <span> tag.
+                  if span_child == 1:
+                    for string_text in line.itertext():
+                      if string_text != None:
+                         textContent = string_text
+                         break
                 if ( textContent == None ):
                   textContent = u""
                 textContent = textContent.rstrip()

diff --git a/README.md b/README.md
@@ -34,5 +34,6 @@ Maybe it's useful for others trying to understand OCR.
 Changes by tristelune1
 
 - this script is for python3
+- text is searched recursively in span tags
 
 Work in progress.