auto download latest checkpoints

lukas-blecher · lukas-blecher · commit b3069e32667f · 2022-03-28T13:52:04.000+02:00
diff --git a/.gitignore b/.gitignore
@@ -133,6 +133,7 @@ notebooks/
 dataset/data/**
 wandb/
 checkpoints/**
+!checkpoints/*.py
 !**/.gitkeep
 .vscode
 .DS_Store
diff --git a/README.md b/README.md
@@ -23,7 +23,7 @@ In order to render the math in many different fonts we use  XeLaTeX, generate a
 ## Using the model
 1. Download/Clone this repository
 2. For now you need to install the Python dependencies specified in `requirements.txt` (look [above](#Requirements))
-3. Download the `weights.pth` (and optionally `image_resizer.pth`) file from the [Releases](https://github.com/lukas-blecher/LaTeX-OCR/releases/latest)->Assets section and place it in the `checkpoints` directory
+3. The latest model checkpoint will be downloaded automatically the first time the program is executed. Alternatively, you can download the `weights.pth` (and optionally `image_resizer.pth`) file from the [Releases](https://github.com/lukas-blecher/LaTeX-OCR/releases/latest)->Assets section and place it in the `checkpoints` directory
 
 Thanks to [@katie-lim](https://github.com/katie-lim), you can use a nice user interface as a quick way to get the model prediction. Just call the GUI with `python gui.py`. From here you can take a screenshot and the predicted latex code is rendered using [MathJax](https://www.mathjax.org/) and copied to your clipboard.
 
diff --git a/checkpoints/.gitkeep b/checkpoints/.gitkeep
diff --git a/checkpoints/get_latest_checkpoint.py b/checkpoints/get_latest_checkpoint.py
@@ -0,0 +1,27 @@
+import requests
+import os
+
+# Release page requested by get_latest_tag(); the tag is read from the last
+# path segment of the URL that request ends up at.
+url = 'https://github.com/lukas-blecher/LaTeX-OCR/releases/latest'
+
+
+def get_latest_tag():
+    """Return the tag name of the latest release.
+
+    Requests the ``releases/latest`` page and takes the last path segment of
+    the URL the request ends up at (after any redirects) as the tag.
+
+    Returns:
+        str: the last URL segment, or ``'v0.0.1'`` when that segment is
+        ``'releases'`` (i.e. the final URL carries no tag).
+
+    Raises:
+        requests.exceptions.RequestException: if the request fails or times
+        out.
+    """
+    # Without a timeout an unresponsive server would block the caller forever.
+    r = requests.get(url, timeout=10)
+    tag = r.url.split('/')[-1]
+    if tag == 'releases':
+        return 'v0.0.1'
+    return tag
+
+
+def download_checkpoints():
+    """Download the checkpoints of the latest release next to this file.
+
+    Fetches ``weights.pth`` and ``image_resizer.pth`` from the release tagged
+    with ``get_latest_tag()`` and stores them, under those same names, in the
+    directory that contains this module.
+
+    Raises:
+        requests.exceptions.HTTPError: if ``weights.pth`` cannot be
+        downloaded. A missing ``image_resizer.pth`` is only reported and
+        skipped, since the README describes that file as optional.
+    """
+    tag = get_latest_tag()
+    path = os.path.dirname(__file__)
+    print('download weights', tag, 'to path', path)
+    weights = 'https://github.com/lukas-blecher/LaTeX-OCR/releases/download/%s/weights.pth' % tag
+    resizer = 'https://github.com/lukas-blecher/LaTeX-OCR/releases/download/%s/image_resizer.pth' % tag
+    # (download url, local file name, required)
+    # The local names match the release asset names so the files end up in
+    # the same place as with a manual download.
+    assets = [
+        (weights, 'weights.pth', True),
+        (resizer, 'image_resizer.pth', False),
+    ]
+    for asset_url, name, required in assets:
+        r = requests.get(asset_url, allow_redirects=True)
+        if not r.ok:
+            # Never write an HTTP error page to disk as if it were a checkpoint.
+            if required:
+                r.raise_for_status()
+            print('could not download', name, '(HTTP %s), skipping' % r.status_code)
+            continue
+        # Context manager guarantees the file is flushed and closed.
+        with open(os.path.join(path, name), "wb") as f:
+            f.write(r.content)
+
+
+# Running this file directly as a script downloads the checkpoints.
+if __name__ == '__main__':
+    download_checkpoints()
diff --git a/models.py b/models.py
@@ -2,8 +2,9 @@
 import torch.nn as nn
 import torch.nn.functional as F
 
-from x_transformers import *
-from x_transformers.autoregressive_wrapper import *
+# from x_transformers import *
+from x_transformers import TransformerWrapper, Decoder
+from x_transformers.autoregressive_wrapper import AutoregressiveWrapper, top_k, top_p, entmax, ENTMAX_ALPHA
 from timm.models.vision_transformer import VisionTransformer
 from timm.models.vision_transformer_hybrid import HybridEmbed
 from timm.models.resnetv2 import ResNetV2
@@ -16,7 +17,7 @@ def __init__(self, *args, **kwargs):
         super(CustomARWrapper, self).__init__(*args, **kwargs)
 
     @torch.no_grad()
-    def generate(self, start_tokens, seq_len, eos_token=None, temperature=1., filter_logits_fn=top_k, filter_thres=0.9, **kwargs):
+    def forward(self, start_tokens, seq_len=256, eos_token=None, temperature=1., filter_logits_fn=top_k, filter_thres=0.9, **kwargs):
         device = start_tokens.device
         was_training = self.net.training
         num_dims = len(start_tokens.shape)
@@ -42,9 +43,6 @@ def generate(self, start_tokens, seq_len, eos_token=None, temperature=1., filter
                 filtered_logits = filter_logits_fn(logits, thres=filter_thres)
                 probs = F.softmax(filtered_logits / temperature, dim=-1)
 
-            elif filter_logits_fn is entmax:
-                probs = entmax(logits / temperature, alpha=ENTMAX_ALPHA, dim=-1)
-
             sample = torch.multinomial(probs, 1)
 
             out = torch.cat((out, sample), dim=-1)
@@ -150,6 +148,6 @@ def embed_layer(**x):
         seq = torch.randint(0, args.num_tokens, (args.batchsize, args.max_seq_len), device=args.device).long()
         decoder(seq, context=encoder(im)).sum().backward()
         model.zero_grad()
-        torch.cuda.empty_cache() 
+        torch.cuda.empty_cache()
         del im, seq
     return model
diff --git a/pix2tex.py b/pix2tex.py
@@ -21,6 +21,7 @@
 from dataset.latex2png import tex2pil
 from models import get_model
 from utils import *
+from checkpoints.get_latest_checkpoint import download_checkpoints
 
 last_pic = None
 
@@ -50,7 +51,8 @@ def initialize(arguments=None):
     args.update(**vars(arguments))
     args.wandb = False
     args.device = 'cuda' if torch.cuda.is_available() and not args.no_cuda else 'cpu'
-
+    if not os.path.exists(args.checkpoint):
+        download_checkpoints()
     model = get_model(args)
     model.load_state_dict(torch.load(args.checkpoint, map_location=args.device))
 
@@ -82,9 +84,10 @@ def call_model(args, model, image_resizer, tokenizer, img=None):
     if image_resizer is not None and not args.no_resize:
         with torch.no_grad():
             input_image = img.convert('RGB').copy()
-            r, w = 1, input_image.size[0]
+            r, w, h = 1, input_image.size[0], input_image.size[1]
             for _ in range(10):
-                img = pad(minmax_size(input_image.resize((w, int(input_image.size[1]*r)), Image.BILINEAR if r > 1 else Image.LANCZOS), args.max_dimensions, args.min_dimensions))
+                h = int(h * r)  # height to resize
+                img = pad(minmax_size(input_image.resize((w, h), Image.BILINEAR if r > 1 else Image.LANCZOS), args.max_dimensions, args.min_dimensions))
                 t = test_transform(image=np.array(img.convert('RGB')))['image'][:1].unsqueeze(0)
                 w = (image_resizer(t.to(args.device)).argmax(-1).item()+1)*32
                 logging.info(r, img.size, (w, int(input_image.size[1]*r)))