generated from scotthlee/template
-
Notifications
You must be signed in to change notification settings - Fork 3
/
text_removal.py
69 lines (60 loc) · 2.31 KB
/
text_removal.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
"""Filters out x-rays with burned-in text from the main dataset."""
import cv2
import numpy as np
import pandas as pd
import argparse
import os
import pytesseract
from pytesseract import Output
from multiprocessing import Pool
from hamlet.tools.image import check_text
if __name__ == '__main__':
    # Script entry point: scan every file in --img_dir with check_text and
    # move the ones it flags into --text_dir.
    parser = argparse.ArgumentParser()
    parser.add_argument('--img_dir',
                        type=str,
                        default='D:/data/hamlet/source/immigrant/',
                        help='path to the directory holding the images')
    parser.add_argument('--text_dir',
                        type=str,
                        default='D:/data/hamlet/source/bad/text/',
                        help='path to the directory for dumping the images '
                             'with too much text')
    parser.add_argument('--num_words',
                        type=int,
                        default=3,
                        help='maximum allowable number of words per image')
    parser.add_argument('--convert_to_grayscale',
                        action='store_true')
    parser.add_argument('--no_multiprocessing',
                        action='store_true')
    parser.set_defaults(no_multiprocessing=False,
                        convert_to_grayscale=False)
    args = parser.parse_args()

    # Setting globals
    IMG_DIR = args.img_dir
    TEXT_DIR = args.text_dir
    NUM_WORDS = args.num_words
    USE_MULTIPROCESSING = not args.no_multiprocessing
    GRAY = args.convert_to_grayscale

    # Importing the data
    files = os.listdir(IMG_DIR)

    # Build paths with os.path.join so the directory arguments work with or
    # without a trailing separator (plain string concatenation did not).
    check_args = [(os.path.join(IMG_DIR, f), NUM_WORDS, GRAY) for f in files]

    # Checking the files
    # NOTE(review): check_text presumably returns a truthy value for images
    # with more than NUM_WORDS words -- confirm in hamlet.tools.image.
    if USE_MULTIPROCESSING:
        # starmap blocks until every result is in, so no explicit
        # close()/join() is needed before leaving the context manager.
        with Pool() as p:
            res1 = p.starmap(check_text, check_args)
    else:
        res1 = [check_text(*a) for a in check_args]

    # Selecting the files with text
    with_text = np.where(res1)[0]
    to_move = [files[i] for i in with_text]
    move_args = [(os.path.join(IMG_DIR, f), os.path.join(TEXT_DIR, f))
                 for f in to_move]

    # os.rename fails if the destination directory is missing, so create it
    # up front, but only when there is actually something to move.
    if move_args and TEXT_DIR:
        os.makedirs(TEXT_DIR, exist_ok=True)

    # Moving the files with text
    if USE_MULTIPROCESSING:
        with Pool() as p:
            p.starmap(os.rename, move_args)
    else:
        for src, dst in move_args:
            os.rename(src, dst)