Merge pull request #65 from yutanagano/develop

Minor update to v2.1.0
yutanagano · Mar 2, 2024 · 9952583 · 9952583
2 parents 8999d10 + 6616c55
commit 9952583
Show file tree

Hide file tree

Showing 14 changed files with 3,675 additions and 95 deletions.
diff --git a/README.md b/README.md
@@ -1,8 +1,10 @@
-# tidytcells
+<h1 align="center">
+    <img src="tidytcells.png" width=700>
+</h1>
 
 ![Tests](https://github.com/yutanagano/tidytcells/actions/workflows/tests.yaml/badge.svg)
 [![Docs](https://readthedocs.org/projects/tidytcells/badge/?version=latest)](https://tidytcells.readthedocs.io)
-![License](https://img.shields.io/badge/license-MIT-blue)
+[![License](https://img.shields.io/badge/license-MIT-blue)](https://github.com/yutanagano/tidytcells?tab=MIT-1-ov-file#readme)
 
 `tidytcells` is a lightweight python package that cleans and standardizes T cell receptor (TR) and Major Histocompatibility (MH) data to be [IMGT](https://www.imgt.org/)-compliant.
 The main purpose of the package is to solve the problem of parsing and collating together non-standardized TR datasets.
@@ -34,4 +36,4 @@ from inside the project root directory.
 ## Useful links
 
 - [Documentation](https://tidytcells.readthedocs.io)
-- [PyPI page](https://pypi.org/project/tidytcells)
+- [PyPI page](https://pypi.org/project/tidytcells)
diff --git a/VERSION.txt b/VERSION.txt
@@ -1 +1 @@
-2.0.2
+2.1.0
diff --git a/docs/conf.py b/docs/conf.py
@@ -32,11 +32,11 @@
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
 
 html_theme = "sphinx_book_theme"
-html_title = "tidytcells"
+html_logo = "../tidytcells.png"
 html_theme_options = {
     "repository_url": "https://github.com/yutanagano/tidytcells",
     "path_to_docs": "docs",
     "use_repository_button": True,
-    "use_issues_button": True
+    "use_issues_button": True,
 }
 # html_static_path = ['_static']
diff --git a/docs/index.rst b/docs/index.rst
@@ -3,8 +3,8 @@
    You can adapt this file completely to your liking, but it should at least
    contain the root `toctree` directive.
 
-Welcome to tidytcells's documentation!
-======================================
+tidytcells: Standardise TR/MH data
+==================================
 
 .. figure:: figure.png
    :alt: Cartoon of a T cell receptor binding to a peptide MH complex (left), and a cartoon demonstrating how messy data can be cleaned using tidytcells (right).

diff --git a/scripts/homosapiens_catalogue_tr.py b/scripts/homosapiens_catalogue_tr.py
@@ -1,9 +1,4 @@
-from bs4 import BeautifulSoup
-import collections
-import itertools
-from io import StringIO
 import pandas as pd
-import requests
 from typing import Iterable
 import script_utility
 
@@ -18,7 +13,7 @@ def main() -> None:
     script_utility.save_as_json(synonyms_data, "homosapiens_tr_synonyms.json")
 
     print("Fetching TR gene sequence data from IMGT...")
-    sequence_data = get_sequence_data()
+    sequence_data = script_utility.get_tr_aa_sequence_data("Homo+sapiens")
     script_utility.save_as_json(sequence_data, "homosapiens_tr_aa_sequences.json")
 
 
@@ -78,60 +73,5 @@ def get_synonyms_data(valid_alleles: Iterable[str]) -> dict:
     return tr_synonyms["Approved symbol"].to_dict()
 
 
-def get_sequence_data() -> dict:
-    v_gene_sequence_data = get_v_gene_sequence_data()
-    return v_gene_sequence_data
-
-
-def get_v_gene_sequence_data() -> dict:
-    labels = ("FR1-IMGT", "CDR1-IMGT", "FR2-IMGT", "CDR2-IMGT", "FR3-IMGT", "V-REGION")
-    gene_groups = ("TRAV", "TRBV", "TRGV", "TRDV")
-    data_per_gene_group_per_label = [
-        get_sequence_data_for_label_for_gene_group(label, gene_group)
-        for label, gene_group in itertools.product(labels, gene_groups)
-    ]
-
-    combined = collections.defaultdict(dict)
-    for alleles_dict in data_per_gene_group_per_label:
-        for allele, data in alleles_dict.items():
-            combined[allele].update(data)
-
-    return combined
-
-
-def get_sequence_data_for_label_for_gene_group(label: str, gene_group: str) -> dict:
-    aa_seqs = collections.defaultdict(dict)
-
-    response = requests.get(
-        f"https://www.imgt.org/genedb/GENElect?query=8.2+{gene_group}&species=Homo+sapiens&IMGTlabel={label}"
-    )
-    parser = BeautifulSoup(response.text, features="html.parser")
-    fasta = parser.find_all("pre")[1].string
-
-    current_allele = None
-    for line in fasta.splitlines():
-        if line.startswith(">"):
-            fields = line.split("|")
-            allele = fields[1]
-            functionality = fields[3]
-
-            if "F" in functionality:
-                current_allele = allele
-            else:
-                current_allele = None
-
-            continue
-
-        if current_allele is None:
-            continue
-
-        if not label in aa_seqs[current_allele]:
-            aa_seqs[current_allele][label] = line.strip()
-        else:
-            aa_seqs[current_allele][label] += line.strip()
-
-    return aa_seqs
-
-
 if __name__ == "__main__":
     main()
diff --git a/scripts/musmusculus_catalogue_tr.py b/scripts/musmusculus_catalogue_tr.py
@@ -6,6 +6,10 @@ def main() -> None:
     valid_allele_data = script_utility.get_tr_alleles_list("Mus+musculus")
     script_utility.save_as_json(valid_allele_data, "valid_musmusculus_tr.json")
 
+    print("Fetching TR gene sequence data from IMGT...")
+    sequence_data = script_utility.get_tr_aa_sequence_data("Mus+musculus")
+    script_utility.save_as_json(sequence_data, "musmusculus_tr_aa_sequences.json")
+
 
 if __name__ == "__main__":
     main()
diff --git a/scripts/script_utility.py b/scripts/script_utility.py
@@ -1,11 +1,13 @@
 from bs4 import BeautifulSoup
 import collections
+import itertools
 from io import StringIO
 import json
 import pandas as pd
 from pandas import DataFrame
 from pathlib import Path
 import requests
+from typing import Tuple
 
 
 def get_tr_alleles_list(species: str) -> dict:
@@ -55,7 +57,86 @@ def get_tr_alleles_for_gene_group_for_species(gene_group: str, species: str) ->
     return alleles
 
 
-def parse_fasta_header(line: str) -> (str, str, str):
+def get_tr_aa_sequence_data(species: str) -> dict:
+    """Gather V, D and J gene sequence data for a species into one dictionary.
+
+    The V, D and J lookups run in that order; if the same key appears in more
+    than one result, the later lookup's entry replaces the earlier one.
+    """
+    merged = {}
+    for fetch in (
+        get_v_gene_sequence_data,
+        get_d_gene_sequence_data,
+        get_j_gene_sequence_data,
+    ):
+        merged.update(fetch(species))
+    return merged
+
+
+def get_v_gene_sequence_data(species: str) -> dict:
+    """Fetch framework, CDR and V-REGION data for the TRAV/TRBV/TRGV/TRDV groups."""
+    return get_gene_sequence_data(
+        labels=(
+            "FR1-IMGT",
+            "CDR1-IMGT",
+            "FR2-IMGT",
+            "CDR2-IMGT",
+            "FR3-IMGT",
+            "V-REGION",
+        ),
+        gene_groups=("TRAV", "TRBV", "TRGV", "TRDV"),
+        species=species,
+    )
+
+
+def get_d_gene_sequence_data(species: str) -> dict:
+    """Fetch D-REGION data for the TRBD and TRDD gene groups."""
+    return get_gene_sequence_data(
+        labels=("D-REGION",),
+        gene_groups=("TRBD", "TRDD"),
+        species=species,
+    )
+
+
+def get_j_gene_sequence_data(species: str) -> dict:
+    """Fetch FR4-IMGT and J-REGION data for the TRAJ/TRBJ/TRGJ/TRDJ groups."""
+    return get_gene_sequence_data(
+        labels=("FR4-IMGT", "J-REGION"),
+        gene_groups=("TRAJ", "TRBJ", "TRGJ", "TRDJ"),
+        species=species,
+    )
+
+
+def get_gene_sequence_data(
+    labels: Tuple[str, ...], gene_groups: Tuple[str, ...], species: str
+) -> dict:
+    """Collect sequence data for every combination of label and gene group.
+
+    One lookup is made per ``(label, gene_group)`` pair, and the per-allele
+    results of all lookups are merged into a single mapping.
+
+    :param labels: Labels to query, e.g. ``("FR4-IMGT", "J-REGION")``.
+    :param gene_groups: Gene groups to query, e.g. ``("TRBD", "TRDD")``.
+    :param species: Species identifier, passed unchanged to each lookup.
+    :return: Mapping of allele to the merged data found for that allele.
+    """
+    combined = collections.defaultdict(dict)
+    for label, gene_group in itertools.product(labels, gene_groups):
+        alleles_dict = get_sequence_data_for_label_for_gene_group_for_species(
+            label, gene_group, species
+        )
+        for allele, data in alleles_dict.items():
+            # Later lookups add their entries to whatever the allele already has.
+            combined[allele].update(data)
+
+    return combined
+
+
+def get_sequence_data_for_label_for_gene_group_for_species(
+    label: str, gene_group: str, species: str
+) -> dict:
+    """Fetch sequences for one label, gene group and species from IMGT GENElect.
+
+    The FASTA text is read from the second ``<pre>`` element of the returned
+    page. Only records whose functionality field contains ``"F"`` are kept.
+
+    :param label: Value sent as the ``IMGTlabel`` query parameter.
+    :param gene_group: Gene group appended to the ``query`` parameter.
+    :param species: Value sent as the ``species`` query parameter; it is
+        interpolated into the URL as-is (callers pass e.g. ``"Homo+sapiens"``).
+    :return: Mapping of ``{allele: {label: sequence}}``.
+    :raises requests.HTTPError: If IMGT responds with an HTTP error status.
+    :raises requests.Timeout: If IMGT does not respond within the timeout.
+    """
+    response = requests.get(
+        f"https://www.imgt.org/genedb/GENElect?query=8.2+{gene_group}&species={species}&IMGTlabel={label}",
+        # Without a timeout, a stalled connection would hang the script forever.
+        timeout=60,
+    )
+    # Fail with a clear HTTP error instead of trying to parse an error page.
+    response.raise_for_status()
+    parser = BeautifulSoup(response.text, features="html.parser")
+    fasta = parser.find_all("pre")[1].string
+
+    aa_seqs = collections.defaultdict(dict)
+    current_allele = None
+    for line in fasta.splitlines():
+        if line.startswith(">"):
+            # Header fields are "|"-separated: index 1 is the allele name and
+            # index 3 its functionality.
+            fields = line.split("|")
+            # NOTE(review): this is a substring test, so a functionality value
+            # such as "ORF" also matches; confirm that is intended.
+            current_allele = fields[1] if "F" in fields[3] else None
+            continue
+
+        if current_allele is None:
+            # Sequence line belonging to a record that was filtered out.
+            continue
+
+        # A record's sequence may span several lines; join them together.
+        sequences = aa_seqs[current_allele]
+        sequences[label] = sequences.get(label, "") + line.strip()
+
+    return aa_seqs
+
+
+def parse_fasta_header(line: str) -> Tuple[str]:
     fields = line.split("|")
     allele_name = fields[1]
     gene, allele_designation = allele_name.split("*")

diff --git a/setup.py b/setup.py
@@ -10,11 +10,13 @@
 setup(
     name="tidytcells",
     version=VERSION,
-    description="Standardise TR/MH data.",
+    description="Standardise TR/MH data",
     long_description=README,
     long_description_content_type="text/markdown",
     author="Yuta Nagano",
     author_email="yutanagano51@proton.me",
+    url="https://tidytcells.readthedocs.io",
+    download_url="https://github.com/yutanagano/tidytcells",
     classifiers=[
         "Development Status :: 5 - Production/Stable",
         "Intended Audience :: Science/Research",

diff --git a/src/tidytcells/_resources/__init__.py b/src/tidytcells/_resources/__init__.py
@@ -23,6 +23,7 @@ def get_json_resource(filename: str) -> dict:
 
 
 VALID_MUSMUSCULUS_TR = get_json_resource("valid_musmusculus_tr.json")
+MUSMUSCULUS_TR_AA_SEQUENCES = get_json_resource("musmusculus_tr_aa_sequences.json")
 VALID_MUSMUSCULUS_MH = get_json_resource("valid_musmusculus_mh.json")
 MUSMUSCULUS_MH_SYNONYMS = get_json_resource("musmusculus_mh_synonyms.json")