forked from lukas-blecher/LaTeX-OCR
Commit
1 parent 27b620f · commit 5fca76e
Showing 14 changed files with 162 additions and 183 deletions.
@@ -1,7 +1,15 @@
-pix2tex.structures.hybrid package
-======================
+pix2tex.models.hybrid package
+=============================
 
-.. automodule:: pix2tex.structures.hybrid
+.. automodule:: pix2tex.models.hybrid
    :members:
    :no-undoc-members:
    :show-inheritance:
+
+pix2tex.models.vit package
+==========================
+
+.. automodule:: pix2tex.models.vit
+   :members:
+   :no-undoc-members:
+   :show-inheritance:
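A quick way to verify that the renamed autodoc targets resolve is to import them directly. A minimal sketch, not part of this commit, assuming the pix2tex package from this fork is installed in the current environment:

import importlib

# Both modules referenced by the updated automodule directives should import cleanly.
for name in ("pix2tex.models.hybrid", "pix2tex.models.vit"):
    module = importlib.import_module(name)  # raises ModuleNotFoundError if the rename is incomplete
    print(name, "->", module.__file__)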
@@ -0,0 +1 @@
from .utils import *
@@ -0,0 +1,65 @@
import torch
import torch.nn as nn

from x_transformers import TransformerWrapper, Encoder, Decoder
from x_transformers.autoregressive_wrapper import AutoregressiveWrapper

from . import hybrid
from . import vit


class Model(nn.Module):
    def __init__(self, encoder: Encoder, decoder: AutoregressiveWrapper, args):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.args = args

    def forward(self, x: torch.Tensor):
        return self.decoder.generate(torch.LongTensor([self.args.bos_token]*len(x)).to(x.device), self.args.max_seq_len, eos_token=self.args.eos_token, context=self.encoder(x))


def get_model(args, training=False):
    if args.encoder_structure.lower() == 'vit':
        encoder = vit.get_encoder(args)
    elif args.encoder_structure.lower() == 'hybrid':
        encoder = hybrid.get_encoder(args)
    else:
        raise NotImplementedError('Encoder structure "%s" not supported.' % args.encoder_structure)
    decoder = AutoregressiveWrapper(
        TransformerWrapper(
            num_tokens=args.num_tokens,
            max_seq_len=args.max_seq_len,
            attn_layers=Decoder(
                dim=args.dim,
                depth=args.num_layers,
                heads=args.heads,
                cross_attend=True
            )),
        pad_value=args.pad_token
    )
    available_gpus = torch.cuda.device_count()
    if available_gpus > 1:
        encoder = nn.DataParallel(encoder)
        decoder = nn.DataParallel(decoder)
    encoder.to(args.device)
    decoder.to(args.device)
    if args.wandb:
        import wandb
        en_attn_layers = encoder.module.attn_layers if available_gpus > 1 else encoder.attn_layers
        de_attn_layers = decoder.module.net.attn_layers if available_gpus > 1 else decoder.net.attn_layers
        wandb.watch((en_attn_layers, de_attn_layers))
    model = Model(encoder, decoder, args)
    if training:
        # check if largest batch can be handled by system
        batchsize = args.batchsize if args.get(
            'micro_batchsize', -1) == -1 else args.micro_batchsize
        im = torch.empty(batchsize, args.channels, args.max_height,
                         args.min_height, device=args.device).float()
        seq = torch.randint(0, args.num_tokens, (batchsize,
                            args.max_seq_len), device=args.device).long()
        decoder(seq, context=encoder(im)).sum().backward()
        model.zero_grad()
        torch.cuda.empty_cache()
        del im, seq
    return model
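For context, get_model expects an attribute-style config that also supports dict-style .get, as in the Munch objects the project builds from its YAML settings. A minimal usage sketch, with every value below chosen purely for illustration:

import torch
from munch import Munch

# Illustrative config only; real values come from the project's settings files.
args = Munch(
    encoder_structure='vit',
    num_tokens=8000, max_seq_len=512,
    dim=256, num_layers=4, heads=8,
    bos_token=1, eos_token=2, pad_token=0,
    max_width=672, max_height=192, min_height=32,
    channels=1, patch_size=16,
    batchsize=10, wandb=False,
    device='cuda' if torch.cuda.is_available() else 'cpu',
)

model = get_model(args)  # builds the ViT encoder and Transformer decoder and moves them to args.device
im = torch.zeros(1, args.channels, 64, 320, device=args.device)  # dummy grayscale batch; H and W divisible by patch_size
tokens = model(im)       # autoregressive generation up to max_seq_len, stopping at eos_token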
@@ -0,0 +1,74 @@
import torch
import torch.nn as nn

from x_transformers import Encoder
from einops import rearrange, repeat


class ViTransformerWrapper(nn.Module):
    def __init__(
        self,
        *,
        max_width,
        max_height,
        patch_size,
        attn_layers,
        channels=1,
        num_classes=None,
        dropout=0.,
        emb_dropout=0.
    ):
        super().__init__()
        assert isinstance(attn_layers, Encoder), 'attention layers must be an Encoder'
        assert max_width % patch_size == 0 and max_height % patch_size == 0, 'image dimensions must be divisible by the patch size'
        dim = attn_layers.dim
        num_patches = (max_width // patch_size)*(max_height // patch_size)
        patch_dim = channels * patch_size ** 2

        self.patch_size = patch_size
        self.max_width = max_width
        self.max_height = max_height

        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, dim))
        self.patch_to_embedding = nn.Linear(patch_dim, dim)
        self.cls_token = nn.Parameter(torch.randn(1, 1, dim))
        self.dropout = nn.Dropout(emb_dropout)

        self.attn_layers = attn_layers
        self.norm = nn.LayerNorm(dim)
        #self.mlp_head = FeedForward(dim, dim_out = num_classes, dropout = dropout) if exists(num_classes) else None

    def forward(self, img, **kwargs):
        p = self.patch_size

        x = rearrange(img, 'b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1=p, p2=p)
        x = self.patch_to_embedding(x)
        b, n, _ = x.shape

        cls_tokens = repeat(self.cls_token, '() n d -> b n d', b=b)
        x = torch.cat((cls_tokens, x), dim=1)
        h, w = torch.tensor(img.shape[2:])//p
        pos_emb_ind = repeat(torch.arange(h)*(self.max_width//p-w), 'h -> (h w)', w=w)+torch.arange(h*w)
        pos_emb_ind = torch.cat((torch.zeros(1), pos_emb_ind+1), dim=0).long()
        x += self.pos_embedding[:, pos_emb_ind]
        x = self.dropout(x)

        x = self.attn_layers(x, **kwargs)
        x = self.norm(x)

        return x


def get_encoder(args):
    return ViTransformerWrapper(
        max_width=args.max_width,
        max_height=args.max_height,
        channels=args.channels,
        patch_size=args.patch_size,
        emb_dropout=args.get('emb_dropout', 0),
        attn_layers=Encoder(
            dim=args.dim,
            depth=args.num_layers,
            heads=args.heads,
        )
    )
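The pos_emb_ind arithmetic in forward maps the patch grid of an input image, which may be smaller than max_width x max_height, onto the learned positional-embedding table, so a patch at a given row and column always receives the same embedding regardless of image size; index 0 is reserved for the CLS token. A small worked sketch of just that index computation, with illustrative sizes:

import torch
from einops import repeat

# Example: a 2x3 patch grid inside a maximum grid that is 5 patches wide (max_width // patch_size == 5).
h, w, max_cols = 2, 3, 5

# Each row is offset by the number of unused columns in the full-width grid.
pos_emb_ind = repeat(torch.arange(h) * (max_cols - w), 'h -> (h w)', w=w) + torch.arange(h * w)
pos_emb_ind = torch.cat((torch.zeros(1), pos_emb_ind + 1), dim=0).long()
print(pos_emb_ind)  # tensor([0, 1, 2, 3, 6, 7, 8]): CLS slot plus the first three columns of rows 0 and 1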