Skip to content

Commit 528dbe2

Browse files
committed
update on paper:
1 parent 35aa7e6 commit 528dbe2

File tree

2 files changed

+40
-59
lines changed

2 files changed

+40
-59
lines changed

paper.bib

Lines changed: 33 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -15,23 +15,6 @@ @article{yin_encoding_2018
1515
pages = {143--151},
1616
}
1717

18-
@article{vinga_pattern_2012,
19-
title = {Pattern matching through {Chaos} {Game} {Representation}: bridging numerical and discrete data structures for biological sequence analysis},
20-
volume = {7},
21-
issn = {1748-7188},
22-
shorttitle = {Pattern matching through {Chaos} {Game} {Representation}},
23-
url = {https://doi.org/10.1186/1748-7188-7-10},
24-
doi = {10.1186/1748-7188-7-10},
25-
abstract = {Chaos Game Representation (CGR) is an iterated function that bijectively maps discrete sequences into a continuous domain. As a result, discrete sequences can be object of statistical and topological analyses otherwise reserved to numerical systems. Characteristically, CGR coordinates of substrings sharing an L-long suffix will be located within $2^{-L}$ distance of each other. In the two decades since its original proposal, CGR has been generalized beyond its original focus on genomic sequences and has been successfully applied to a wide range of problems in bioinformatics. This report explores the possibility that it can be further extended to approach algorithms that rely on discrete, graph-based representations.},
26-
number = {1},
27-
urldate = {2020-05-20},
28-
journal = {Algorithms for Molecular Biology},
29-
author = {Vinga, Susana and Carvalho, Alexandra M. and Francisco, Alexandre P. and Russo, Luís MS and Almeida, Jonas S.},
30-
month = may,
31-
year = {2012},
32-
pages = {10},
33-
}
34-
3518
@article{jeffrey_chaos_1990,
3619
title = {Chaos game representation of gene structure},
3720
volume = {18},
@@ -49,38 +32,40 @@ @article{jeffrey_chaos_1990
4932
pages = {2163--2170},
5033
}
5134

52-
@article{almeida_analysis_2001,
53-
title = {Analysis of genomic sequences by {Chaos} {Game} {Representation}},
54-
volume = {17},
55-
issn = {1367-4803},
56-
doi = {10.1093/bioinformatics/17.5.429},
57-
abstract = {MOTIVATION: Chaos Game Representation (CGR) is an iterative mapping technique that processes sequences of units, such as nucleotides in a DNA sequence or amino acids in a protein, in order to find the coordinates for their position in a continuous space. This distribution of positions has two properties: it is unique, and the source sequence can be recovered from the coordinates such that distance between positions measures similarity between the corresponding sequences. The possibility of using the latter property to identify succession schemes have been entirely overlooked in previous studies which raises the possibility that CGR may be upgraded from a mere representation technique to a sequence modeling tool.
58-
RESULTS: The distribution of positions in the CGR plane were shown to be a generalization of Markov chain probability tables that accommodates non-integer orders. Therefore, Markov models are particular cases of CGR models rather than the reverse, as currently accepted. In addition, the CGR generalization has both practical (computational efficiency) and fundamental (scale independence) advantages. These results are illustrated by using Escherichia coli K-12 as a test data-set, in particular, the genes thrA, thrB and thrC of the threonine operon.},
35+
36+
@article{greener_guide_2022,
37+
title = {A guide to machine learning for biologists},
38+
volume = {23},
39+
issn = {1471-0080},
40+
doi = {10.1038/s41580-021-00407-0},
41+
abstract = {The expanding scale and inherent complexity of biological data have encouraged a growing use of machine learning in biology to build informative and predictive models of the underlying biological processes. All machine learning techniques fit models to data; however, the specific methods are quite varied and can at first glance seem bewildering. In this Review, we aim to provide readers with a gentle introduction to a few key machine learning techniques, including the most recently developed and widely used techniques involving deep neural networks. We describe how different techniques may be suited to specific types of biological data, and also discuss some best practices and points to consider when one is embarking on experiments involving machine learning. Some emerging directions in machine learning methodology are also discussed.},
5942
language = {eng},
60-
number = {5},
61-
journal = {Bioinformatics (Oxford, England)},
62-
author = {Almeida, J. S. and Carriço, J. A. and Maretzek, A. and Noble, P. A. and Fletcher, M.},
63-
month = may,
64-
year = {2001},
65-
pmid = {11331237},
66-
pages = {429--437},
43+
number = {1},
44+
journal = {Nature Reviews. Molecular Cell Biology},
45+
author = {Greener, Joe G. and Kandathil, Shaun M. and Moffat, Lewis and Jones, David T.},
46+
month = jan,
47+
year = {2022},
48+
pmid = {34518686},
49+
keywords = {Animals, Biology, Deep Learning, Humans, Machine Learning, Neural Networks, Computer},
50+
pages = {40--55},
6751
}
6852

69-
@article{almeida_biological_2009,
70-
title = {Biological sequences as pictures: a generic two dimensional solution for iterated maps},
71-
volume = {10},
72-
issn = {1471-2105},
73-
shorttitle = {Biological sequences as pictures},
74-
doi = {10.1186/1471-2105-10-100},
75-
abstract = {BACKGROUND: Representing symbolic sequences graphically using iterated maps has enjoyed an enduring popularity since it was first proposed in Jeffrey 1990 as chaos game representation (CGR). The usefulness of this representation goes beyond the convenience of a scale independent representation. It provides a variable memory length representation of transition. This includes the representation of succession with non-integer order, which comes with the promise of generalizing Markovian formalisms. The original proposal targeted genomic sequences only but since then several generalizations have been proposed, many specifically designed to handle protein data.
76-
RESULTS: The challenge of a general solution is that of deriving a bijective transformation of symbolic sequences into bi-dimensional planes. More specifically, it requires the regular fractal nesting of polygons. A first attempt at a general solution was proposed by Fiser 1994 by using non-overlapping circles that contain the polygons. This was used as a starting point to identify a more efficient solution where the encapsulating circles can overlap without the same happening for the sequence maps which are circumscribed to fractal polygon domains.
77-
CONCLUSION: We identified the optimal inscribed packing solution for iterated maps of any Biological sequence, indeed of any symbolic sequence. The new solution maintains the prized bijective mapping property and includes the Sierpinski triangle and the CGR square as particular solutions of the more encompassing formulation.},
78-
language = {eng},
79-
journal = {BMC bioinformatics},
80-
author = {Almeida, Jonas S. and Vinga, Susana},
81-
month = mar,
82-
year = {2009},
83-
pmid = {19335894},
84-
pmcid = {PMC2678093},
85-
pages = {100},
53+
54+
@article{wang_image_2004,
55+
title = {Image quality assessment: from error visibility to structural similarity},
56+
volume = {13},
57+
issn = {1941-0042},
58+
shorttitle = {Image quality assessment},
59+
url = {https://ieeexplore.ieee.org/abstract/document/1284395},
60+
doi = {10.1109/TIP.2003.819861},
61+
abstract = {Objective methods for assessing perceptual image quality traditionally attempted to quantify the visibility of errors (differences) between a distorted image and a reference image using a variety of known properties of the human visual system. Under the assumption that human visual perception is highly adapted for extracting structural information from a scene, we introduce an alternative complementary framework for quality assessment based on the degradation of structural information. As a specific example of this concept, we develop a structural similarity index and demonstrate its promise through a set of intuitive examples, as well as comparison to both subjective ratings and state-of-the-art objective methods on a database of images compressed with JPEG and JPEG2000. A MATLAB implementation of the proposed algorithm is available online at http://www.cns.nyu.edu/~lcv/ssim/.},
62+
number = {4},
63+
urldate = {2025-07-17},
64+
journal = {IEEE Transactions on Image Processing},
65+
author = {Wang, Zhou and Bovik, A.C. and Sheikh, H.R. and Simoncelli, E.P.},
66+
month = apr,
67+
year = {2004},
68+
keywords = {Data mining, Degradation, Humans, Image quality, Indexes, Layout, Quality assessment, Transform coding, Visual perception, Visual system},
69+
pages = {600--612},
70+
file = {Snapshot:/home/ediman/Zotero/storage/NAMTQ9IJ/1284395.html:text/html},
8671
}

paper.md

Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,9 @@
11
---
22
title: 'chaoscoder: block-based integer chaos game representation encoding and decoding of DNA sequences'
33
tags:
4+
- Rust
45
- DNA sequence analysis
5-
- Chaos game representation
6-
- DNA encoding
7-
- DNA decoding
6+
- integer Chaos game representation
87
authors:
98
- name: Anicet E. T. Ebou
109
orcid: 0000-0003-4005-177X
@@ -15,22 +14,20 @@ authors:
1514
affiliations:
1615
- name: Equipe Bioinformatique et Biostatistique, Laboratoire de Microbiologie, Biotechnologie et Bioinformatique, Institut National Polytechnique Félix Houphouët-Boigny, BP 1093 Yamoussoukro, Côte d'Ivoire
1716
index: 1
18-
date: 15 July 2025
17+
date: "17 July 2025"
1918
bibliography: paper.bib
2019
---
2120

2221
# Summary
2322

24-
Computational analysis of DNA sequences underpins numerous bioinformatics applications, including sequence classification, genome comparison, mutation detection, and evolutionary studies. These tasks often require transforming symbolic nucleotide sequences (A, T, C, G) into numerical representations suitable for mathematical processing or machine learning.
23+
Computational analysis of DNA sequences underpins numerous bioinformatics applications, including sequence classification, genome comparison, mutation detection, and evolutionary studies. These tasks often require transforming symbolic nucleotide sequences (A, T, C, G) into numerical representations suitable for mathematical processing or machine learning [@greener_guide_2022].
2524

26-
Chaos Game Representation (CGR) is a well-established method that encodes DNA sequences as points in a 2D space, revealing motifs and structural patterns [@jeffrey_chaos_1990]. However, traditional CGR depends on floating-point arithmetic, leading to rounding errors and imprecisionespecially when applied to long sequences or tasks that require exact sequence reconstruction.
25+
Chaos Game Representation (CGR) is a well-established method that encodes DNA sequences as points in a 2D space, revealing motifs and structural patterns [@jeffrey_chaos_1990]. However, traditional CGR depends on floating-point arithmetic, leading to rounding errors and imprecision, especially when applied to long sequences or tasks that require exact sequence reconstruction.
2726

2827
`chaoscoder` implements the Integer Chaos Game Representation (iCGR), a variant that operates entirely in integer space to provide lossless encoding and decoding [@yin_encoding_2018]. To address the exponential scaling limitation of iCGR, the software introduces a block-based variant that divides sequences into overlapping segments, enabling scalable and parallelizable encoding of genome-length sequences.
2928

3029
The software provides a command-line interface for encoding, decoding, visualizing CGRs, and comparing sequence structure via image-based SSIM (Structural Similarity Index Measure). It supports standardized storage of encoded data in a custom `.bicgr` file format, designed for efficient downstream use.
3130

32-
Written in Rust for performance and reliability, `chaoscoder` is well-suited for researchers and developers working with large-scale genomic datasets where precision, reversibility, and scalability are essential.
33-
3431
# Implementation
3532

3633
## Encoding and decoding DNA sequences by integer CGR
@@ -39,7 +36,7 @@ Written in Rust for performance and reliability, `chaoscoder` is well-suited for
3936

4037
## Block-based encoding
4138

42-
Due to the exponential nature of coordinate growth in iCGR, encoding long sequences (e.g., full genomes) directly is computationally infeasible. To mitigate this, `chaoscoder` implements a block-based iCGR approach. Sequences are partitioned into fixed-size, optionally overlapping segments (e.g., 50–100 nt), each of which is independently encoded using the iCGR algorithm (Figure 1).
39+
Due to the exponential nature of coordinate growth in iCGR, encoding long sequences (e.g., full genomes) directly is computationally infeasible. To mitigate this, `chaoscoder` implements a block-based iCGR approach. Sequences are partitioned into fixed-size, overlapping segments (e.g., 50–100 nt), each of which is independently encoded using the iCGR algorithm (Figure 1).
4340

4441
![Workflow of block-based iCGR encoding](chaoscoder.jpg)
4542

@@ -55,8 +52,7 @@ It includes the sequence ID (mandatory), the sequence description (optional), th
5552

5653
## Other features
5754

58-
`chaoscoder` offers additional functionalities to support exploratory and comparative genomics. First, the software can generate 2D CGR images for encoded sequences. Second, users can compute Structural Similarity Index (SSIM) between CGR images to compare sequence patterns without alignment.
59-
Finally, encoding and decoding tasks are multithreaded to improve performance on large datasets.
55+
`chaoscoder` offers additional functionalities to support exploratory and comparative genomics. First, the software can generate 2D CGR images for encoded sequences. Second, users can compute the Structural Similarity Index (SSIM) between CGR images to compare sequence patterns without alignment. This feature uses the `dssim` Rust library ([docs.rs/dssim/latest/dssim](https://docs.rs/dssim/latest/dssim)), which is based on the SSIM algorithm of Wang *et al.* [@wang_image_2004]. Finally, encoding and decoding tasks are multithreaded to improve performance on large datasets.
6056

6157

6258
# Installation

0 commit comments

Comments
 (0)