Skip to content

Commit

Permalink
Merge pull request #65 from yutanagano/develop
Browse files Browse the repository at this point in the history
Minor update to v2.1.0
  • Loading branch information
yutanagano authored Mar 2, 2024
2 parents 8999d10 + 6616c55 commit 9952583
Show file tree
Hide file tree
Showing 14 changed files with 3,675 additions and 95 deletions.
8 changes: 5 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
# tidytcells
<h1 align="center">
<img src="tidytcells.png" width=700>
</h1>

![Tests](https://github.com/yutanagano/tidytcells/actions/workflows/tests.yaml/badge.svg)
[![Docs](https://readthedocs.org/projects/tidytcells/badge/?version=latest)](https://tidytcells.readthedocs.io)
![License](https://img.shields.io/badge/license-MIT-blue)
[![License](https://img.shields.io/badge/license-MIT-blue)](https://github.com/yutanagano/tidytcells?tab=MIT-1-ov-file#readme)

`tidytcells` is a lightweight Python package that cleans and standardizes T cell receptor (TR) and Major Histocompatibility (MH) data to be [IMGT](https://www.imgt.org/)-compliant.
The main purpose of the package is to solve the problem of parsing and collating together non-standardized TR datasets.
Expand Down Expand Up @@ -34,4 +36,4 @@ from inside the project root directory.
## Useful links

- [Documentation](https://tidytcells.readthedocs.io)
- [PyPI page](https://pypi.org/project/tidytcells)
- [PyPI page](https://pypi.org/project/tidytcells)
2 changes: 1 addition & 1 deletion VERSION.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
2.0.2
2.1.0
4 changes: 2 additions & 2 deletions docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,11 +32,11 @@
# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output

html_theme = "sphinx_book_theme"
html_title = "tidytcells"
html_logo = "../tidytcells.png"
html_theme_options = {
"repository_url": "https://github.com/yutanagano/tidytcells",
"path_to_docs": "docs",
"use_repository_button": True,
"use_issues_button": True
"use_issues_button": True,
}
# html_static_path = ['_static']
4 changes: 2 additions & 2 deletions docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.
Welcome to tidytcells's documentation!
======================================
tidytcells: Standardise TR/MH data
==================================

.. figure:: figure.png
:alt: Cartoon of a T cell receptor binding to a peptide MH complex (left), and a cartoon demonstrating how messy data can be cleaned using tidytcells (right).
Expand Down
62 changes: 1 addition & 61 deletions scripts/homosapiens_catalogue_tr.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,4 @@
from bs4 import BeautifulSoup
import collections
import itertools
from io import StringIO
import pandas as pd
import requests
from typing import Iterable
import script_utility

Expand All @@ -18,7 +13,7 @@ def main() -> None:
script_utility.save_as_json(synonyms_data, "homosapiens_tr_synonyms.json")

print("Fetching TR gene sequence data from IMGT...")
sequence_data = get_sequence_data()
sequence_data = script_utility.get_tr_aa_sequence_data("Homo+sapiens")
script_utility.save_as_json(sequence_data, "homosapiens_tr_aa_sequences.json")


Expand Down Expand Up @@ -78,60 +73,5 @@ def get_synonyms_data(valid_alleles: Iterable[str]) -> dict:
return tr_synonyms["Approved symbol"].to_dict()


def get_sequence_data() -> dict:
    """Return the TR sequence data; this is exactly the V gene sequence data."""
    return get_v_gene_sequence_data()


def get_v_gene_sequence_data() -> dict:
    """Gather the V gene sequences for every (IMGT label, gene group) pair.

    Returns:
        A mapping from allele name to a dict of ``{label: sequence}``, built
        by merging the per-label, per-gene-group query results.
    """
    imgt_labels = (
        "FR1-IMGT",
        "CDR1-IMGT",
        "FR2-IMGT",
        "CDR2-IMGT",
        "FR3-IMGT",
        "V-REGION",
    )
    v_gene_groups = ("TRAV", "TRBV", "TRGV", "TRDV")

    merged = collections.defaultdict(dict)
    # Labels vary slowest, gene groups fastest (itertools.product ordering).
    for imgt_label, group in itertools.product(imgt_labels, v_gene_groups):
        per_allele = get_sequence_data_for_label_for_gene_group(imgt_label, group)
        for allele_name, label_to_sequence in per_allele.items():
            merged[allele_name].update(label_to_sequence)

    return merged


def get_sequence_data_for_label_for_gene_group(label: str, gene_group: str) -> dict:
    """Fetch the sequences IMGT GENElect reports for one label and gene group.

    Queries GENElect for Homo sapiens, reads the FASTA text from the second
    ``<pre>`` element of the returned page, and keeps only records whose
    functionality header field contains the letter "F".

    Args:
        label: Value passed as the ``IMGTlabel`` query parameter.
        gene_group: Gene group name appended to the ``query=8.2+`` parameter.

    Returns:
        A mapping ``{allele: {label: sequence}}`` where each sequence is the
        concatenation of that allele's stripped FASTA lines.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the page has no second ``<pre>`` element holding
            plain text.
    """
    response = requests.get(
        f"https://www.imgt.org/genedb/GENElect?query=8.2+{gene_group}&species=Homo+sapiens&IMGTlabel={label}"
    )
    # Fail loudly on HTTP errors rather than trying to parse an error page.
    response.raise_for_status()

    pre_blocks = BeautifulSoup(response.text, features="html.parser").find_all("pre")
    fasta = pre_blocks[1].string if len(pre_blocks) > 1 else None
    if fasta is None:
        raise ValueError(
            f"No FASTA block found in the IMGT response for label {label!r} "
            f"and gene group {gene_group!r}"
        )

    chunks_per_allele = collections.defaultdict(list)
    current_allele = None
    for line in fasta.splitlines():
        if line.startswith(">"):
            fields = line.split("|")
            # NOTE(review): this is a substring test, so a functionality value
            # such as "ORF" is accepted as well - confirm that is intended.
            current_allele = fields[1] if "F" in fields[3] else None
            continue

        # Sequence lines under a rejected header (or before any header) are ignored.
        if current_allele is not None:
            chunks_per_allele[current_allele].append(line.strip())

    aa_seqs = collections.defaultdict(dict)
    for allele, chunks in chunks_per_allele.items():
        aa_seqs[allele][label] = "".join(chunks)

    return aa_seqs


if __name__ == "__main__":
main()
4 changes: 4 additions & 0 deletions scripts/musmusculus_catalogue_tr.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@ def main() -> None:
valid_allele_data = script_utility.get_tr_alleles_list("Mus+musculus")
script_utility.save_as_json(valid_allele_data, "valid_musmusculus_tr.json")

print("Fetching TR gene sequence data from IMGT...")
sequence_data = script_utility.get_tr_aa_sequence_data("Mus+musculus")
script_utility.save_as_json(sequence_data, "musmusculus_tr_aa_sequences.json")


if __name__ == "__main__":
main()
83 changes: 82 additions & 1 deletion scripts/script_utility.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
from bs4 import BeautifulSoup
import collections
import itertools
from io import StringIO
import json
import pandas as pd
from pandas import DataFrame
from pathlib import Path
import requests
from typing import Tuple


def get_tr_alleles_list(species: str) -> dict:
Expand Down Expand Up @@ -55,7 +57,86 @@ def get_tr_alleles_for_gene_group_for_species(gene_group: str, species: str) ->
return alleles


def parse_fasta_header(line: str) -> (str, str, str):
def get_tr_aa_sequence_data(species: str) -> dict:
    """Combine the V, D and J gene sequence data for ``species`` into one dict.

    The three sources are fetched in V, D, J order; if the same key appears in
    more than one of them, the later source overwrites the earlier entry.
    """
    fetchers = (
        get_v_gene_sequence_data,
        get_d_gene_sequence_data,
        get_j_gene_sequence_data,
    )

    merged = {}
    for fetch in fetchers:
        merged.update(fetch(species))
    return merged


def get_v_gene_sequence_data(species: str) -> dict:
    """Fetch the sequence data for the V gene groups of ``species``.

    Delegates to ``get_gene_sequence_data`` with the IMGT labels and gene
    groups listed below.
    """
    return get_gene_sequence_data(
        (
            "FR1-IMGT",
            "CDR1-IMGT",
            "FR2-IMGT",
            "CDR2-IMGT",
            "FR3-IMGT",
            "V-REGION",
        ),
        ("TRAV", "TRBV", "TRGV", "TRDV"),
        species,
    )


def get_d_gene_sequence_data(species: str) -> dict:
    """Fetch the sequence data for the D gene groups of ``species``.

    Delegates to ``get_gene_sequence_data`` with the single ``D-REGION``
    label and the TRBD/TRDD gene groups.
    """
    return get_gene_sequence_data(
        ("D-REGION",),
        ("TRBD", "TRDD"),
        species,
    )


def get_j_gene_sequence_data(species: str) -> dict:
    """Fetch the sequence data for the J gene groups of ``species``.

    Delegates to ``get_gene_sequence_data`` with the ``FR4-IMGT`` and
    ``J-REGION`` labels and the four J gene groups.
    """
    return get_gene_sequence_data(
        ("FR4-IMGT", "J-REGION"),
        ("TRAJ", "TRBJ", "TRGJ", "TRDJ"),
        species,
    )


def get_gene_sequence_data(
    labels: Tuple[str, ...], gene_groups: Tuple[str, ...], species: str
) -> dict:
    """Fetch and merge sequence data for every (label, gene group) pair.

    Args:
        labels: IMGT labels to query; any number of entries.
        gene_groups: Gene group names to query; any number of entries.
        species: Species string forwarded unchanged to each query.

    Returns:
        A mapping from allele name to a dict of ``{label: sequence}``,
        combining the results of all queries.
    """
    combined = collections.defaultdict(dict)

    # Labels vary slowest, gene groups fastest (itertools.product ordering).
    # Each result is merged as soon as it arrives, so no list of all
    # responses is kept around.
    for label, gene_group in itertools.product(labels, gene_groups):
        alleles_dict = get_sequence_data_for_label_for_gene_group_for_species(
            label, gene_group, species
        )
        for allele, data in alleles_dict.items():
            combined[allele].update(data)

    return combined


def get_sequence_data_for_label_for_gene_group_for_species(
    label: str, gene_group: str, species: str
) -> dict:
    """Fetch the sequences IMGT GENElect reports for one label, group and species.

    Reads the FASTA text from the second ``<pre>`` element of the returned
    page and keeps only records whose functionality header field contains
    the letter "F".

    Args:
        label: Value passed as the ``IMGTlabel`` query parameter.
        gene_group: Gene group name appended to the ``query=8.2+`` parameter.
        species: Value passed as the ``species`` query parameter. It is
            inserted into the URL verbatim, so it must already be URL-encoded
            (callers in this file pass e.g. ``"Homo+sapiens"``).

    Returns:
        A mapping ``{allele: {label: sequence}}`` where each sequence is the
        concatenation of that allele's stripped FASTA lines.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the page has no second ``<pre>`` element holding
            plain text.
    """
    response = requests.get(
        f"https://www.imgt.org/genedb/GENElect?query=8.2+{gene_group}&species={species}&IMGTlabel={label}"
    )
    # Fail loudly on HTTP errors rather than trying to parse an error page.
    response.raise_for_status()

    pre_blocks = BeautifulSoup(response.text, features="html.parser").find_all("pre")
    fasta = pre_blocks[1].string if len(pre_blocks) > 1 else None
    if fasta is None:
        raise ValueError(
            f"No FASTA block found in the IMGT response for label {label!r}, "
            f"gene group {gene_group!r} and species {species!r}"
        )

    chunks_per_allele = collections.defaultdict(list)
    current_allele = None
    for line in fasta.splitlines():
        if line.startswith(">"):
            fields = line.split("|")
            # NOTE(review): this is a substring test, so a functionality value
            # such as "ORF" is accepted as well - confirm that is intended.
            current_allele = fields[1] if "F" in fields[3] else None
            continue

        # Sequence lines under a rejected header (or before any header) are ignored.
        if current_allele is not None:
            chunks_per_allele[current_allele].append(line.strip())

    aa_seqs = collections.defaultdict(dict)
    for allele, chunks in chunks_per_allele.items():
        aa_seqs[allele][label] = "".join(chunks)

    return aa_seqs


def parse_fasta_header(line: str) -> Tuple[str]:
fields = line.split("|")
allele_name = fields[1]
gene, allele_designation = allele_name.split("*")
Expand Down
4 changes: 3 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,13 @@
setup(
name="tidytcells",
version=VERSION,
description="Standardise TR/MH data.",
description="Standardise TR/MH data",
long_description=README,
long_description_content_type="text/markdown",
author="Yuta Nagano",
author_email="yutanagano51@proton.me",
url="https://tidytcells.readthedocs.io",
download_url="https://github.com/yutanagano/tidytcells",
classifiers=[
"Development Status :: 5 - Production/Stable",
"Intended Audience :: Science/Research",
Expand Down
1 change: 1 addition & 0 deletions src/tidytcells/_resources/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ def get_json_resource(filename: str) -> dict:


VALID_MUSMUSCULUS_TR = get_json_resource("valid_musmusculus_tr.json")
MUSMUSCULUS_TR_AA_SEQUENCES = get_json_resource("musmusculus_tr_aa_sequences.json")
VALID_MUSMUSCULUS_MH = get_json_resource("valid_musmusculus_mh.json")
MUSMUSCULUS_MH_SYNONYMS = get_json_resource("musmusculus_mh_synonyms.json")

Expand Down
Loading

0 comments on commit 9952583

Please sign in to comment.