Skip to content

Commit

Permalink
fix: fix hard code region in parser
Browse files Browse the repository at this point in the history
  • Loading branch information
IcyKallen committed May 20, 2024
1 parent 003c5fa commit 98d5ede
Showing 1 changed file with 23 additions and 16 deletions.
39 changes: 23 additions & 16 deletions source/containers/document-pii-detection/parsers/parser.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
import os
import magic
import re
from tempfile import NamedTemporaryFile

import boto3
import magic

def merge_strings(substrings, delimiter='.', max_length=128, truncate_buffer=False):

def merge_strings(substrings, delimiter=".", max_length=128, truncate_buffer=False):
"""
Merges a list of strings into a list of strings with a maximum length.
Expand All @@ -20,7 +22,7 @@ def merge_strings(substrings, delimiter='.', max_length=128, truncate_buffer=Fal
if a substring in original substrings is longer than max_length, it will still be in merged substrings.
"""
merged_strings = []
buffer = ''
buffer = ""
for substring in substrings:
if len(buffer + delimiter + substring) > max_length:
buffer = buffer[:max_length] if truncate_buffer else buffer
Expand All @@ -33,18 +35,19 @@ def merge_strings(substrings, delimiter='.', max_length=128, truncate_buffer=Fal
merged_strings.append(buffer)
return merged_strings


class BaseParser:
def __init__(self, s3_client):
# constructor code here
# self.region = region
self.s3_client=s3_client
self.s3_client = s3_client
pass

def parse_file(self, file_path, **kwargs):
    """Extract raw text from a file; every subclass must override this.

    The base implementation never returns a value: it always raises.

    Args:
        file_path: The file to parse. NOTE(review): ``load_content`` passes
            the downloaded object's bytes here rather than a path --
            confirm which of the two subclasses actually expect.
        **kwargs: Extra options; ignored by the base class.

    Raises:
        NotImplementedError: Always, because the base class has no parser.
    """
    raise NotImplementedError("must be overwritten by child classes")

def load_content(self, bucket, object_key):
"""
Expand All @@ -63,39 +66,43 @@ def load_content(self, bucket, object_key):
processed_content = self.postprocess_content(file_content)
"""
# begin download obj into memory
s3 = boto3.resource('s3', region_name='cn-northwest-1')
s3 = boto3.resource("s3", region_name=self.s3_client.meta.region_name)
obj = s3.Object(bucket, object_key).get()

try:
file_content = self.parse_file(obj['Body'].read())
file_content = self.parse_file(obj["Body"].read())
except Exception as e:
print(f"Failed to parse file {object_key}. Error: {e}")
file_content = []
processed_content = self.postprocess_content(file_content)
# end download obj into memory

return processed_content

def postprocess_content(self, file_content, *, max_length=128, max_items=10000):
    """Turn parsed pages into a bounded list of text chunks.

    Every page is split into lines and blank lines are dropped. A line
    longer than ``max_length`` is cut after each ``.``, ``。`` or ``;`` and
    the pieces are regrouped by ``merge_strings``. All chunks are then
    merged once more, joined by a single space with buffer truncation
    enabled, and the result is capped at ``max_items`` entries.

    Args:
        file_content: Iterable of page strings.
        max_length: Length limit handed to ``merge_strings`` and used as
            the threshold for splitting a line (default 128).
        max_items: Maximum number of chunks returned (default 10000).

    Returns:
        list: At most ``max_items`` chunks as produced by ``merge_strings``.
    """
    # Zero-width lookbehind: split *after* each sentence terminator so the
    # terminator stays attached to the sentence it ends.
    sentence_end = re.compile(r"(?<=[.。;])")

    chunks = []
    for page in file_content:
        for line in page.splitlines():
            if not line.strip():
                continue  # drop blank / whitespace-only lines
            if len(line) > max_length:
                chunks.extend(
                    merge_strings(
                        sentence_end.split(line),
                        delimiter="",
                        max_length=max_length,
                    )
                )
            else:
                chunks.append(line)

    merged = merge_strings(
        chunks, delimiter=" ", max_length=max_length, truncate_buffer=True
    )
    return merged[:max_items]

def get_encoding(self, file_path):
    """Detect the encoding of a file using the ``magic`` module.

    Args:
        file_path: Path of the file to inspect; it is read in full in
            binary mode.

    Returns:
        The value returned by ``magic.Magic(mime_encoding=True).from_buffer``
        for the file's bytes.
    """
    with open(file_path, "rb") as handle:
        raw_bytes = handle.read()
    detector = magic.Magic(mime_encoding=True)
    return detector.from_buffer(raw_bytes)

0 comments on commit 98d5ede

Please sign in to comment.