Skip to content

Commit

Permalink
Added docstrings + added type hinting.
Browse files Browse the repository at this point in the history
Fixed readme to display image.
  • Loading branch information
RSWilson1 committed Sep 13, 2023
1 parent 45e6d9f commit d3fdfb2
Show file tree
Hide file tree
Showing 3 changed files with 55 additions and 32 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@ Custom script for processing a list of ids (HGNC, transcript) or coordinates wit

## IGV reports output

Example Output:
[!image] (<https://raw.githubusercontent.com/eastgenomics/gene_annotation2bed/dev/data/demo/demo_igv_reports.png>)
IGV report:
![image](<https://raw.githubusercontent.com/eastgenomics/gene_annotation2bed/dev/data/demo/demo_igv_reports.png>)

The script produces a HTML report of all the bed file entries. Displayed in IGV with the refseq track
and bed file aligned with the respective annotation.
Expand Down
67 changes: 43 additions & 24 deletions gene_annotation2bed.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,10 @@ def parse_args() -> argparse.Namespace:
"""
Parse command line arguments
Parameters
----------
None
Returns
-------
args : Namespace
Expand Down Expand Up @@ -70,7 +74,8 @@ def parse_args() -> argparse.Namespace:

def parse_gff(gff_file):
"""
Summary: Import GFF3 file and convert to pandas DataFrame.
Import GFF3 file and convert to pandas DataFrame.
The GFF3 file is imported into a dataframe and then all the attributes
in the attributes column are split into separate columns.
It then drops many of the additional fields from the attributes column
Expand Down Expand Up @@ -186,19 +191,27 @@ def parse_gff(gff_file):
return transcripts_df


def parse_annotation_tsv(path, gff_transcripts_df):
def parse_annotation_tsv(path: str,
gff_transcripts_df: pd.DataFrame) -> tuple[pd.DataFrame,
pd.DataFrame]:
"""
_summary_
Parse an annotation TSV file and separate it into dataframes for HGNC IDs,
Transcript IDs, and Coordinates, then merge them with a GFF dataframe.
Parameters
----------
path : _type_
_description_
path : str
The file path to the TSV annotation file.
gff_transcripts_df : pd.DataFrame
A dataframe containing GFF information including transcript IDs.
Returns
-------
annotation_dataframe for HGNC_ids and transcripts.
coordinated dataframe for coordinates to be appended to bed later.
Tuple[pd.DataFrame, pd.DataFrame]
A tuple containing two dataframes:
1. The merged dataframe for HGNC IDs and transcripts. (hgnc_merged_df)
2. The coordinates dataframe, to be appended
to a BED file later (coordinates_df).
"""
df = pd.read_csv(path, sep="\t")
# Create masks for HGNC, Transcript, and Coordinates dataframes
Expand Down Expand Up @@ -271,7 +284,7 @@ def parse_annotation_tsv(path, gff_transcripts_df):
else:
print("All Transcript rows were merged successfully")
# Concatenate the merged dataframes
final_df = pd.concat([merged_hgnc_df, merged_transcript_df])
hgnc_merged_df = pd.concat([merged_hgnc_df, merged_transcript_df])

# Coordinates dataframe split into columns
# Split the "Coordinates" column by ':' and '-'
Expand All @@ -287,10 +300,10 @@ def parse_annotation_tsv(path, gff_transcripts_df):
["chromosome", "start", "end", "annotation", "gene"]
]

return final_df, coordinates_df
return hgnc_merged_df, coordinates_df


def extract_hgnc_id(dbxref_str):
def extract_hgnc_id(dbxref_str: str):
"""
Wrapper function to extract HGNC ID from a string of dbxrefs.
Expand All @@ -314,14 +327,14 @@ def extract_hgnc_id(dbxref_str):
return None


def read_assembly_mapping(assembly_file):
def read_assembly_mapping(assembly_file: str):
"""
Reads in the associated assembly file and returns a dictionary mapping
to find chromosome for each refseq accession.
Parameters
----------
assembly_file : tsv
assembly_file : str (file path to tsv)
found at: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.25_GRCh37.p13/
Returns
Expand All @@ -339,7 +352,7 @@ def read_assembly_mapping(assembly_file):
return accession_to_chromosome


def map_accession_to_chromosome(accession, accession_to_chromosome):
def map_accession_to_chromosome(accession: str, accession_to_chromosome: dict):
"""
Simple mapping function to find chromosome for a given refseq accession.
Calls the accession_to_chromosome dictionary and extracts the chromosome.
Expand All @@ -360,13 +373,13 @@ def map_accession_to_chromosome(accession, accession_to_chromosome):
return accession_to_chromosome.get(accession, f"Unknown - {accession}")


def parse_pickle(pickle_file):
def parse_pickle(pickle_file: str):
"""
Parses a pickle file and returns a DataFrame of transcripts.
Parameters
----------
pickle_file : pkl
pickle_file : str (path to Pickle file)
pickle file of a GFF DataFrame once parsed
with columns from attributes_to_columns
Expand All @@ -381,7 +394,7 @@ def parse_pickle(pickle_file):
return transcripts_df


def merge_overlapping(bed_df):
def merge_overlapping(bed_df: pd.DataFrame):
"""
Function to merge overlapping regions in a bed file by annotation.
Expand Down Expand Up @@ -428,7 +441,7 @@ def merge_overlapping(bed_df):
return merged_df


def config_igv_report(args):
def config_igv_report(args: argparse.Namespace):
"""
Function to call igv report script with the correct parameters.
Generates an IGV html report using generic handling.
Expand Down Expand Up @@ -466,19 +479,25 @@ def config_igv_report(args):
print("IGV report created successfully!")


def write_bed(annotation_df, coordinates_df, args) -> None:
def write_bed(annotation_df: pd.DataFrame,
coordinates_df: pd.DataFrame,
args: argparse.Namespace) -> None:
"""
Combines dataframes, extracts chromosome for HGNC_ids,
and writes to MAF & BED file for IGV visualisation and VEP annotation.
Parameters
----------
annotation_df : _type_
_description_
coordinates_df : _type_
_description_
args : _type_
_description_
annotation_df : pd.DataFrame
A dataframe containing annotation information.
coordinates_df : pd.DataFrame
A dataframe containing coordinates information.
args : Namespace
A namespace containing command-line arguments and options.
Returns
-------
None
"""
# Create BED file with flanking regions
print("Creating BED file")
Expand Down
16 changes: 10 additions & 6 deletions igv_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,10 @@
import subprocess


def create_igv_report(bed_file, maf_file, genome, reference_file, info_columns, title, output_file):
def create_igv_report(bed_file: str, maf_file: str,
genome: str, reference_file: str,
info_columns: list, title: str,
output_file: str) -> None:
"""
Create an IGV report from a bed file.
Parameters
Expand All @@ -20,8 +23,9 @@ def create_igv_report(bed_file, maf_file, genome, reference_file, info_columns,
Returns
-------
None.
Prints the standard output and error of the subprocess.
Creates an IGV report.
Creates an IGV report file.
"""
tracks_config = [
{
Expand Down Expand Up @@ -87,8 +91,8 @@ def create_igv_report(bed_file, maf_file, genome, reference_file, info_columns,


if __name__ == "__main__":
bed_file = "data/test.maflite.maf"
genome = "hg19"
title = f"TEST"
output = "test.html"
bed_file_str = "data/test.maflite.maf"
genome_str = "hg19"
title_str = f"TEST"
output_str = "test.html"
create_igv_report(bed_file, genome, title, output)

0 comments on commit d3fdfb2

Please sign in to comment.