-
Notifications
You must be signed in to change notification settings - Fork 515
Closed
Labels
bug: Something isn't working
Description
Describe the bug
When the code below converts the Tesseract output to a DataFrame, the pandas.DataFrame.infer_objects method is called, which infers any sequence of digits (zip codes, SSNs, loan IDs, etc.) as a float datatype.
| _data = pytesseract.image_to_data(img_content, lang=self.lang, **self.configs) |
_data = pytesseract.image_to_data(img_content, lang=self.lang, **self.configs)
res["data"] = pd.read_csv(
io.StringIO(_data), quoting=csv.QUOTE_NONE, encoding="utf-8", sep="\t"
)
To Reproduce
Steps to reproduce the behavior:
- Take any image that contains just a sequence of digits and run the TesseractAgent.detect() function.
Refer to the attached screenshot.
Environment
This bug is platform-independent; it was reproduced on both Windows and Linux.
Screenshots
Code
Image
The full image cannot be attached for security reasons.

on another image

When the gather_data function is called, it raises an error too.
Traceback
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
/tmp/ipykernel_37746/999922218.py in <module>
----> 1 ocr_agent.gather_data(res, agg_level=TesseractFeatureType.LINE)
~/.virtualenvs/appdev/lib/python3.8/site-packages/layoutparser/ocr/tesseract_agent.py in gather_data(response, agg_level)
140 res = response["data"]
141 df = (
--> 142 res[~res.text.isna()]
143 .groupby(agg_level.group_levels)
144 .apply(
~/.virtualenvs/appdev/lib/python3.8/site-packages/pandas/core/groupby/groupby.py in apply(self, func, *args, **kwargs)
1270 with option_context("mode.chained_assignment", None):
1271 try:
-> 1272 result = self._python_apply_general(f, self._selected_obj)
1273 except TypeError:
1274 # gh-20949
~/.virtualenvs/appdev/lib/python3.8/site-packages/pandas/core/groupby/groupby.py in _python_apply_general(self, f, data)
1304 data after applying f
1305 """
-> 1306 keys, values, mutated = self.grouper.apply(f, data, self.axis)
1307
1308 return self._wrap_applied_output(
~/.virtualenvs/appdev/lib/python3.8/site-packages/pandas/core/groupby/ops.py in apply(self, f, data, axis)
781 try:
782 sdata = splitter.sorted_data
--> 783 result_values, mutated = splitter.fast_apply(f, sdata, group_keys)
784
785 except IndexError:
~/.virtualenvs/appdev/lib/python3.8/site-packages/pandas/core/groupby/ops.py in fast_apply(self, f, sdata, names)
1326 # must return keys::list, values::list, mutated::bool
1327 starts, ends = lib.generate_slices(self.slabels, self.ngroups)
-> 1328 return libreduction.apply_frame_axis0(sdata, f, names, starts, ends)
1329
1330 def _chop(self, sdata: DataFrame, slice_obj: slice) -> DataFrame:
~/.virtualenvs/appdev/lib/python3.8/site-packages/pandas/_libs/reduction.pyx in pandas._libs.reduction.apply_frame_axis0()
~/.virtualenvs/appdev/lib/python3.8/site-packages/layoutparser/ocr/tesseract_agent.py in <lambda>(gp)
150 gp["height"].max(),
151 gp["conf"].mean(),
--> 152 gp["text"].str.cat(sep=" "),
153 ]
154 )
~/.virtualenvs/appdev/lib/python3.8/site-packages/pandas/core/generic.py in __getattr__(self, name)
5485 ):
5486 return self[name]
-> 5487 return object.__getattribute__(self, name)
5488
5489 def __setattr__(self, name: str, value) -> None:
~/.virtualenvs/appdev/lib/python3.8/site-packages/pandas/core/accessor.py in __get__(self, obj, cls)
179 # we're accessing the attribute of the class, i.e., Dataset.geo
180 return self._accessor
--> 181 accessor_obj = self._accessor(obj)
182 # Replace the property with the accessor object. Inspired by:
183 # https://www.pydanny.com/cached-property.html
~/.virtualenvs/appdev/lib/python3.8/site-packages/pandas/core/strings/accessor.py in __init__(self, data)
166 from pandas.core.arrays.string_ import StringDtype
167
--> 168 self._inferred_dtype = self._validate(data)
169 self._is_categorical = is_categorical_dtype(data.dtype)
170 self._is_string = isinstance(data.dtype, StringDtype)
~/.virtualenvs/appdev/lib/python3.8/site-packages/pandas/core/strings/accessor.py in _validate(data)
223
224 if inferred_dtype not in allowed_types:
--> 225 raise AttributeError("Can only use .str accessor with string values!")
226 return inferred_dtype
227
AttributeError: Can only use .str accessor with string values!
Metadata
Assignees
Labels
bug: Something isn't working
