From 942c86f2506f5b03e16e9cbe739e23b556e62747 Mon Sep 17 00:00:00 2001 From: Mahmoud Parsian Date: Sat, 9 Apr 2016 23:51:40 -0700 Subject: [PATCH] added DNA-Base Counting using External Python function --- tutorial/dna-basecount/README.md | 3 + tutorial/dna-basecount/basemapper.py | 14 +++ tutorial/dna-basecount/dna-basecount3.md | 105 +++++++++++++++++++++++ tutorial/dna-basecount/dna_seq.txt | 2 + 4 files changed, 124 insertions(+) create mode 100755 tutorial/dna-basecount/basemapper.py create mode 100644 tutorial/dna-basecount/dna-basecount3.md create mode 100644 tutorial/dna-basecount/dna_seq.txt diff --git a/tutorial/dna-basecount/README.md b/tutorial/dna-basecount/README.md index 8b038ec..9345c23 100644 --- a/tutorial/dna-basecount/README.md +++ b/tutorial/dna-basecount/README.md @@ -1,4 +1,7 @@ DNA Base Counting ================= * [DNA Base Counting Without In-Mapper Combiner](./dna-basecount.md) + * [DNA Base Counting With In-Mapper Combiner](./dna-basecount2.md) + +* [DNA Base Counting With External Python Function](./dna-basecount3.md) diff --git a/tutorial/dna-basecount/basemapper.py b/tutorial/dna-basecount/basemapper.py new file mode 100755 index 0000000..301e185 --- /dev/null +++ b/tutorial/dna-basecount/basemapper.py @@ -0,0 +1,14 @@ +#!/usr/bin/python + +def mapper(seq): + freq = dict() + for x in list(seq): + if x in freq: + freq[x] +=1 + else: + freq[x] = 1 +# + kv = [(x, freq[x]) for x in freq] + return kv +# +#print mapper("ATCGATCGATAT") diff --git a/tutorial/dna-basecount/dna-basecount3.md b/tutorial/dna-basecount/dna-basecount3.md new file mode 100644 index 0000000..cde5eba --- /dev/null +++ b/tutorial/dna-basecount/dna-basecount3.md @@ -0,0 +1,105 @@ +DNA Base Counting using PySpark +=============================== + +DNA Base Count Definition +------------------------- +[DNA Base Counting is defined here.](https://www.safaribooksonline.com/library/view/data-algorithms/9781491906170/ch24.html) + +Solution in PySpark +------------------- 
+This solution assumes that each record is a DNA sequence. +This solution emits a ````(base, 1)```` for every base in +a given sequence and then aggregates all frequencies for +unique bases. For this solution we use an external Python +function defined in ````basemapper.py````. + +* Define Python Function + +```` +$ export SPARK_HOME=/home/mparsian/spark-1.6.1-bin-hadoop2.6 +$ cat $SPARK_HOME/basemapper.py +#!/usr/bin/python + +def mapper(seq): + freq = dict() + for x in list(seq): + if x in freq: + freq[x] +=1 + else: + freq[x] = 1 +# + kv = [(x, freq[x]) for x in freq] + return kv +# +#for testing: +#print mapper("ATCGATCGATAT") +```` +* Define Very Basic Sample Input + +```` +$ cat /home/mparsian/dna_seq.txt +ATATCCCCGGGAT +ATCGATCGATAT +```` + +* Sample PySpark Run + +```` +# ./bin/pyspark +Welcome to + ____ __ + / __/__ ___ _____/ /__ + _\ \/ _ \/ _ `/ __/ '_/ + /__ / .__/\_,_/_/ /_/\_\ version 1.6.1 + /_/ + +SparkContext available as sc, HiveContext available as sqlContext. +>>> recs = sc.textFile('file:///home/mparsian/dna_seq.txt') + +>>> recs.collect() +[ + u'ATATCCCCGGGAT', + u'ATCGATCGATAT' +] + +>>> ones = recs.flatMap(lambda x : [(c,1) for c in list(x)]) +>>> ones.collect() +[ + (u'A', 1), + (u'T', 1), + (u'A', 1), + (u'T', 1), + (u'C', 1), + (u'C', 1), + (u'C', 1), + (u'C', 1), + (u'G', 1), + (u'G', 1), + (u'G', 1), + (u'A', 1), + (u'T', 1), + (u'A', 1), + (u'T', 1), + (u'C', 1), + (u'G', 1), + (u'A', 1), + (u'T', 1), + (u'C', 1), + (u'G', 1), + (u'A', 1), + (u'T', 1), + (u'A', 1), + (u'T', 1) +] +>>> baseCount = ones.reduceByKey(lambda x,y : x+y) +>>> baseCount.collect() +[ + (u'A', 7), + (u'C', 6), + (u'G', 5), + (u'T', 7) +] +>>> +```` + + diff --git a/tutorial/dna-basecount/dna_seq.txt b/tutorial/dna-basecount/dna_seq.txt new file mode 100644 index 0000000..7c2164e --- /dev/null +++ b/tutorial/dna-basecount/dna_seq.txt @@ -0,0 +1,2 @@ +ATATCCCCGGGAT +ATCGATCGATAT