Commit 97df469

dataset updates
1 parent 69da9f4 commit 97df469

13 files changed: +302 -66

README.md (+1 -1)

@@ -11,7 +11,7 @@ The goal of this project is to create a learning based system that takes an imag
 The `pix2tex.py` file offers a fast way to get the model prediction of an image. First you need to copy the formula image into the clipboard memory for example by using a snipping tool (on Windows built in `Win`+`Shift`+`S`). Next just call the script with `python pix2tex.py`. It will print out the predicted LaTeX code for that image and also copy it into your clipboard.
 
 ## Data
-We need paired data for the network to learn. Luckily there is a lot of LaTeX code on the internet, e.g. [wikipedia](www.wikipedia.org), [arXiv](www.arxiv.org). We also use the formulae from the [im2latex-170k](https://www.kaggle.com/rvente/im2latex170k) dataset.
+We need paired data for the network to learn. Luckily there is a lot of LaTeX code on the internet, e.g. [wikipedia](www.wikipedia.org), [arXiv](www.arxiv.org). We also use the formulae from the [im2latex-100k](https://zenodo.org/record/56198#.V2px0jXT6eA) dataset.
 
 ### Fonts
 Latin Modern Math, GFSNeohellenicMath.otf, Asana Math, XITS Math, Cambria Math

dataset/arxiv.py (+66 -30)

@@ -6,6 +6,9 @@
 import glob
 import re
 import sys
+import argparse
+import logging
+import shutil
 import subprocess
 import tarfile
 import tempfile
@@ -17,9 +20,11 @@
 try:
     from extract_latex import *
     from scraping import *
+    from demacro import *
 except:
     from dataset.extract_latex import *
     from dataset.scraping import *
+    from dataset.demacro import *
 
 # logging.getLogger().setLevel(logging.INFO)
 arxiv_id = re.compile(r'(?<!\d)(\d{4}\.\d{5})(?!\d)')
@@ -49,73 +54,104 @@ def download(url, dir_path='./'):
     return 0
 
 
-def read_tex_files(file_path):
+def read_tex_files(file_path, demacro=True):
     tex = ''
     try:
         with tempfile.TemporaryDirectory() as tempdir:
-            tf = tarfile.open(file_path, 'r')
-            tf.extractall(tempdir)
-            tf.close()
-            texfiles = [os.path.abspath(x) for x in glob.glob(os.path.join(tempdir, '**', '*.tex'), recursive=True)]
-            # de-macro
-            ret = subprocess.run(['de-macro', *texfiles], cwd=tempdir, capture_output=True)
-            if ret.returncode == 0:
-                texfiles = glob.glob(os.path.join(tempdir, '**', '*-clean.tex'), recursive=True)
+            try:
+                tf = tarfile.open(file_path, 'r')
+                tf.extractall(tempdir)
+                tf.close()
+                texfiles = [os.path.abspath(x) for x in glob.glob(os.path.join(tempdir, '**', '*.tex'), recursive=True)]
+                # de-macro
+                if demacro:
+                    ret = subprocess.run(['de-macro', *texfiles], cwd=tempdir, capture_output=True)
+                    if ret.returncode == 0:
+                        texfiles = glob.glob(os.path.join(tempdir, '**', '*-clean.tex'), recursive=True)
+            except tarfile.ReadError as e:
+                texfiles = [file_path]  # [os.path.join(tempdir, file_path+'.tex')]
+                #shutil.move(file_path, texfiles[0])
+
             for texfile in texfiles:
                 try:
-                    tex += open(texfile, 'r', encoding=chardet.detect(open(texfile, 'br').readline())['encoding']).read()
+                    tex += open(texfile, 'r', encoding=chardet.detect(open(texfile, 'br').readline())['encoding']).read()
                 except UnicodeDecodeError:
                     pass
-
-    except tarfile.ReadError:
-        try:
-            tex += open(file_path, 'r', encoding=chardet.detect(open(file_path, 'br').readline())['encoding']).read()
-        except Exception as e:
-            logging.info('Could not read %s: %s' % (file_path, str(e)))
-            pass
+            tex = unfold(convert(tex))
+    except Exception as e:
+        logging.debug('Could not read %s: %s' % (file_path, str(e)))
+        pass
     # remove comments
     return re.sub(r'(?<!\\)%.*\n', '', tex)
 
 
-def read_paper(arxiv_id, dir_path='./'):
+def download_paper(arxiv_id, dir_path='./'):
     url = arxiv_base + arxiv_id
-    targz_path = download(url, dir_path)
+    return download(url, dir_path)
+
+
+def read_paper(targz_path, delete=True, demacro=True):
     paper = ''
     if targz_path != 0:
-        paper = read_tex_files(targz_path)
-        os.remove(targz_path)
+        paper = read_tex_files(targz_path, demacro)
+        if delete:
+            os.remove(targz_path)
     return paper
 
 
-def parse_arxiv(id):
+def parse_arxiv(id, demacro=True):
     tempdir = tempfile.gettempdir()
-    text = read_paper(id, tempdir)
+    text = read_paper(download_paper(id, tempdir), demacro=demacro)
     #print(text, file=open('paper.tex', 'w'))
     #linked = list(set([l for l in re.findall(arxiv_id, text)]))
 
     return find_math(text, wiki=False), []
 
 
 if __name__ == '__main__':
-    skips = os.path.join(sys.path[0], 'dataset', 'data', 'visited_arxiv.txt')
+    parser = argparse.ArgumentParser(description='Extract math from arxiv')
+    parser.add_argument('-m', '--mode', default='top100', choices=['top100', 'id', 'dir'],
+                        help='Where to extract code from. top100: current 100 arxiv papers, id: specific arxiv ids. \
+                              Usage: `python arxiv.py -m id id001 id002`, dir: a folder full of .tar.gz files. Usage: `python arxiv.py -m dir directory`')
+    parser.add_argument(nargs='+', dest='args', default=[])
+    parser.add_argument('-o', '--out', default=os.path.join(os.path.dirname(os.path.realpath(__file__)), 'data'), help='output directory')
+    parser.add_argument('-d', '--no-demacro', dest='demacro', action='store_false', help='Use de-macro (Slows down extraction but improves quality)')
+    args = parser.parse_args()
+    if '.' in args.out:
+        args.out = os.path.dirname(args.out)
+    skips = os.path.join(args.out, 'visited_arxiv.txt')
     if os.path.exists(skips):
         skip = open(skips, 'r', encoding='utf-8').read().split('\n')
     else:
         skip = []
-    if len(sys.argv) > 1:
-        arxiv_ids = sys.argv[1:]
-        visited, math = recursive_search(parse_arxiv, arxiv_ids, skip=skip, unit='paper')
-
-    else:
+    if args.mode == 'ids':
+        visited, math = recursive_search(parse_arxiv, args.args, skip=skip, unit='paper')
+    elif args.mode == 'top100':
        url = 'https://arxiv.org/list/hep-th/2012?skip=0&show=100'  # https://arxiv.org/list/hep-th/2012?skip=0&show=100
        ids = get_all_arxiv_ids(requests.get(url).text)
        math, visited = [], ids
        for id in tqdm(ids):
            m, _ = parse_arxiv(id)
            math.extend(m)
+    elif args.mode == 'dir':
+        dirs = os.listdir(args.args[0])
+        math, visited = [], []
+        for f in tqdm(dirs):
+            try:
+                text = read_paper(os.path.join(args.args[0], f), False, args.demacro)
+                math.extend(find_math(text, wiki=False))
+                visited.append(os.path.basename(f))
+            except Exception as e:
+                logging.debug(e)
+                pass
+    else:
+        raise NotImplementedError
 
     for l, name in zip([visited, math], ['visited_arxiv.txt', 'math_arxiv.txt']):
-        f = open(os.path.join(sys.path[0], 'dataset', 'data', name), 'a', encoding='utf-8')
+        f = os.path.join(args.out, name)
+        if not os.path.exists(f):
+            open(f, 'w').write('')
+        f = open(f, 'a', encoding='utf-8')
         for element in l:
             f.write(element)
             f.write('\n')
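
Aside: the refactor above splits the old `read_paper` into `download_paper` (fetch the .tar.gz) and `read_paper` (extract, optionally de-macro, strip comments), which is what the new `dir` mode builds on: local archives can now be parsed without re-downloading or deleting them. One thing to watch: the dispatch compares `args.mode` against `'ids'` while the argparse choice is spelled `id`, so as committed that mode falls through to the `NotImplementedError` branch. A minimal sketch of how the two entry points compose, assuming the `dataset` package is importable; the arXiv id and paths below are made-up placeholders:

    # Sketch only: '2012.00001' and the paths are placeholder values.
    import tempfile
    from dataset.arxiv import download_paper, read_paper
    from dataset.extract_latex import find_math

    # Online: fetch the source archive, parse it, delete it afterwards.
    targz = download_paper('2012.00001', tempfile.gettempdir())
    tex = read_paper(targz, delete=True, demacro=True)

    # Offline (what `-m dir` does per file): parse a local archive and keep it.
    tex = read_paper('archives/2012.00001.tar.gz', delete=False, demacro=False)
    print(find_math(tex, wiki=False)[:5])  # first few extracted formulas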

dataset/dataset.py (+59 -7)

@@ -12,7 +12,49 @@
 from collections import defaultdict
 import pickle
 from PIL import Image
+import cv2
 from transformers import PreTrainedTokenizerFast
+from tqdm.auto import tqdm
+import albumentations as alb
+from albumentations.pytorch import ToTensorV2
+
+
+class AugWrap:
+    def __init__(self, aug):
+        self.aug = aug
+
+    def __call__(self, image):
+        return self.aug(image=image)['image'][:1]  # /255
+
+
+train_transform = AugWrap(
+    alb.Compose(
+        [
+            alb.Compose(
+                [alb.ShiftScaleRotate(shift_limit=0, scale_limit=(-.15, 0), rotate_limit=1, border_mode=0, interpolation=3,
+                                      value=[255, 255, 255], p=1),
+                 alb.GridDistortion(distort_limit=0.1, border_mode=0, interpolation=3, value=[255, 255, 255], p=.5)], p=.15),
+            alb.InvertImg(p=.15),
+            alb.RGBShift(r_shift_limit=15, g_shift_limit=15,
+                         b_shift_limit=15, p=0.3),
+            alb.GaussNoise(10, p=.2),
+            alb.RandomBrightnessContrast(.05, (-.2, 0), True, p=0.2),
+            alb.JpegCompression(95, p=.5),
+            alb.ToGray(always_apply=True),
+            alb.Normalize((0.7931, 0.7931, 0.7931), (0.1738, 0.1738, 0.1738)),
+            # alb.Sharpen()
+            ToTensorV2(),
+        ]
+    ))
+test_transform = AugWrap(
+    alb.Compose(
+        [
+            alb.ToGray(always_apply=True),
+            alb.Normalize((0.7931, 0.7931, 0.7931), (0.1738, 0.1738, 0.1738)),
+            # alb.Sharpen()
+            ToTensorV2(),
+        ]
+    ))
 
 
 class Im2LatexDataset:
@@ -26,8 +68,9 @@ class Im2LatexDataset:
     pad_token_id = 0
     bos_token_id = 1
     eos_token_id = 2
+    transform = train_transform
 
-    def __init__(self, equations=None, images=None, tokenizer=None, shuffle=True, batchsize=16, max_dimensions=(1024, 512), keep_smaller_batches=False):
+    def __init__(self, equations=None, images=None, tokenizer=None, shuffle=True, batchsize=16, max_dimensions=(1024, 512), pad=False, keep_smaller_batches=False, test=False):
         """Generates a torch dataset from pairs of `equations` and `images`.
 
         Args:
@@ -37,37 +80,41 @@ def __init__(self, equations=None, images=None, tokenizer=None, shuffle=True, ba
             shuffle (bool, optional): Defaults to True.
             batchsize (int, optional): Defaults to 16.
             max_dimensions (tuple(int, int), optional): Maximal dimensions the model can handle
+            pad (bool): Pad the images to `max_dimensions`. Defaults to False.
             keep_smaller_batches (bool): Whether to also return batches with smaller size than `batchsize`. Defaults to False.
+            test (bool): Whether to use the test transformation or not. Defaults to False.
         """
 
         if images is not None and equations is not None:
             assert tokenizer is not None
-            self.images = [path.replace('\\', '/') for path in glob.glob(join(images, '*.png'))]
+            self.images = [path.replace('\\', '/') for path in glob.glob(join(images, '*.png'))]
             self.sample_size = len(self.images)
             eqs = open(equations, 'r').read().split('\n')
             self.indices = [int(os.path.basename(img).split('.')[0]) for img in self.images]
             self.tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer)
             self.shuffle = shuffle
             self.batchsize = batchsize
             self.max_dimensions = max_dimensions
+            self.pad = pad
             self.keep_smaller_batches = keep_smaller_batches
+            self.test = test
             self.data = defaultdict(lambda: [])
             # check the image dimension for every image and group them together
-            for i, im in enumerate(self.images):
+            for i, im in tqdm(enumerate(self.images), total=len(self.images)):
                 width, height = imagesize.get(im)
                 if width <= max_dimensions[0] and height <= max_dimensions[1]:
                     self.data[(width, height)].append((eqs[self.indices[i]], im))
             self.data = dict(self.data)
             self._get_size()
 
-            self.transform = transforms.Compose([transforms.PILToTensor()])  # , transforms.Normalize([200],[255/2]),transforms.RandomPerspective(fill=0)])
             iter(self)
 
     def __len__(self):
         return self.size
 
     def __iter__(self):
         self.i = 0
+        self.transform = test_transform if self.test else train_transform
         self.pairs = []
         for k in self.data:
             info = np.array(self.data[k], dtype=object)
@@ -105,12 +152,17 @@ def prepare_data(self, batch):
         eqs, ims = batch.T
         images = []
         for path in list(ims):
-            images.append(self.transform(Image.open(path)))
+            im = cv2.imread(path)
+            if im is None:
+                print(path, 'not found!')
+                continue
+            im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
+            images.append(self.transform(im))
         tok = self.tokenizer(list(eqs), return_token_type_ids=False)
         # pad with bos and eos token
         for k, p in zip(tok, [[self.bos_token_id, self.eos_token_id], [1, 1]]):
             tok[k] = pad_sequence([torch.LongTensor([p[0]]+x+[p[1]]) for x in tok[k]], batch_first=True, padding_value=self.pad_token_id)
-        images = torch.cat(images).float().unsqueeze(1)/255
+        images = torch.cat(images).float().unsqueeze(1)
         if self.pad:
             h, w = images.shape[2:]
             images = F.pad(images, (0, self.max_dimensions[0]-w, 0, self.max_dimensions[1]-h), value=1)
@@ -142,7 +194,7 @@ def save(self, filename):
             pickle.dump(self, file)
 
     def update(self, **kwargs):
-        for k in ['batchsize', 'shuffle', 'pad', 'keep_smaller_batches']:
+        for k in ['batchsize', 'shuffle', 'pad', 'keep_smaller_batches', 'test']:
             if k in kwargs:
                 setattr(self, k, kwargs[k])
         if 'max_dimensions' in kwargs:
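
Aside: image loading now goes through OpenCV and the albumentations pipelines above (mild scale/rotation jitter, grid distortion, inversion, RGB shift, noise, brightness/contrast, JPEG compression, then grayscale, normalize, tensor), replacing the bare `PILToTensor` transform; since `Normalize` runs inside the pipeline, the old `/255` in `prepare_data` is dropped. A rough sketch of what a single image goes through, assuming `dataset.dataset` is importable; the image path is a placeholder:

    # Sketch only: the image path is a placeholder.
    import cv2
    from dataset.dataset import train_transform, test_transform

    im = cv2.imread('data/images/0000001.png')  # HxWx3 uint8, BGR
    im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)    # mirror prepare_data's conversion
    x = train_transform(im)                     # AugWrap -> normalized 1xHxW float tensor
    print(x.shape)                              # torch.Size([1, H, W])

    x_eval = test_transform(im)                 # grayscale + normalize only, no augmentation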

dataset/demacro.py (+96)

@@ -0,0 +1,96 @@
+# modified from https://tex.stackexchange.com/a/521639
+
+import argparse
+import re
+
+
+def main():
+    args = parse_command_line()
+    data = read(args.input)
+    data = convert(data)
+    if args.demacro:
+        data = unfold(data)
+    write(args.output, data)
+
+
+def parse_command_line():
+    parser = argparse.ArgumentParser(description='Replace \\def with \\newcommand where possible.')
+    parser.add_argument('input', help='TeX input file with \\def')
+    parser.add_argument('--output', '-o', required=True, help='TeX output file with \\newcommand')
+    parser.add_argument('--demacro', action='store_true', help='replace all commands with their definition')
+
+    return parser.parse_args()
+
+
+def read(path):
+    with open(path, mode='r') as handle:
+        return handle.read()
+
+
+def convert(data):
+    return re.sub(
+        r'((?:\\(?:expandafter|global|long|outer|protected)'
+        r'(?: +|\r?\n *)?)*)?'
+        r'\\def *(\\[a-zA-Z]+) *(?:#+([0-9]))*\{',
+        replace,
+        data,
+    )
+
+
+def unfold(t):
+    cmds = re.findall(r'\\(?:re)?newcommand\*?{\\(.+?)}\s*(\[\d\])?(\[.+?\])?{(.+?)}\n', t)
+    cmds = sorted(cmds, key=lambda x: len(x[0]))
+    # print(cmds)
+    for c in cmds:
+        nargs = int(c[1][1]) if c[1] != r'' else 0
+        # print(c)
+        if nargs == 0:
+            #t = t.replace(r'\\%s' % c[0], c[-1])
+            t = re.sub(r'\\%s([\W_^\d])' % c[0], r'%s\1' % c[-1].replace('\\', r'\\'), t)
+        else:
+            matches = re.findall(r'(\\%s(?:\[(.+?)\])?' % c[0]+r'{(.+?)}'*(nargs-(1 if c[2] != r'' else 0))+r')', t)
+            # print(matches)
+            for i, m in enumerate(matches):
+                r = c[-1]
+                if m[1] == r'':
+                    matches[i] = (m[0], c[2][1:-1], *m[2:])
+                for j in range(1, nargs+1):
+                    r = r.replace(r'#%i' % j, matches[i][j])
+                t = t.replace(matches[i][0], r)
+    return t
+
+
+def replace(match):
+    prefix = match.group(1)
+    if (
+        prefix is not None and
+        (
+            'expandafter' in prefix or
+            'global' in prefix or
+            'outer' in prefix or
+            'protected' in prefix
+        )
+    ):
+        return match.group(0)
+
+    result = r'\newcommand'
+    if prefix is None or 'long' not in prefix:
+        result += '*'
+
+    result += '{' + match.group(2) + '}'
+    if match.lastindex == 3:
+        result += '[' + match.group(3) + ']'
+
+    result += '{'
+    return result
+
+
+def write(path, data):
+    with open(path, mode='w') as handle:
+        handle.write(data)
+
+    print('=> File written: {0}'.format(path))
+
+
+if __name__ == '__main__':
+    main()
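
Aside: `read_tex_files` now calls `unfold(convert(tex))` from this module: `convert` rewrites `\def` definitions as `\newcommand` where possible, and `unfold` substitutes each `\newcommand` body back into the text, so the downstream formula extraction sees macro-free LaTeX. A toy round trip, with a made-up input snippet (note `unfold` also rewrites the definition line itself, which is harmless here since only math environments are extracted later):

    # Sketch only: the TeX snippet is made up.
    from dataset.demacro import convert, unfold

    src = '\\def\\R{\\mathbb{R}}\n$x \\in \\R^n$\n'
    converted = convert(src)          # \def\R{...} becomes \newcommand*{\R}{...}
    expanded = unfold(converted)      # occurrences of \R are expanded to \mathbb{R}
    print(expanded.splitlines()[-1])  # $x \in \mathbb{R}^n$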
