Initial commit
bitquark committed Mar 1, 2016
0 parents commit df69794
Showing 12 changed files with 2,222,126 additions and 0 deletions.
21 changes: 21 additions & 0 deletions LICENSE
@@ -0,0 +1,21 @@
The MIT License (MIT)

Copyright (c) 2016 Jon

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
15 changes: 15 additions & 0 deletions README.md
@@ -0,0 +1,15 @@
# DNSpop

Tools to find popular trends by analysis of DNS data. For more information, see my blog post on [the most popular subdomains on the internet](https://bitquark.co.uk/blog/2016/02/29/the_most_popular_subdomains_on_the_internet). Hit the results directory to get straight to the data.

## code/subpop.sh

A script to build a list of popular subdomains based on Rapid7's Project Sonar [Forward DNS](https://github.com/rapid7/sonar/wiki/Forward-DNS) data set.
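
A hypothetical end-to-end run, assuming `pv` and `curl` are installed and a decompressed Forward DNS snapshot sits next to the script (the snapshot date below is illustrative, not a real listing):

```bash
# Unpack a snapshot downloaded from https://scans.io/study/sonar.fdns
gunzip 20160213_dnsrecords_all.gz

# Build the popularity list; results land in 20160213_subdomains_popular
./subpop.sh
head 20160213_subdomains_popular
```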

## code/suffix_strip.py

A script to efficiently strip suffixes from domains using data from the [Public Suffix List](https://publicsuffix.org/list/). Used by _subpop.sh_ but can be used as a stand-alone script.
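
Run stand-alone, it reads domains on stdin and writes suffix-stripped names to stdout; a minimal sketch, assuming `public_suffix_list.dat` has already been fetched to the working directory:

```bash
printf 'www.example.co.uk\nmail.example.com\n' | ./suffix_strip.py
# www.example
# mail.example
```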

## results/*

Result sets from the above tools.
49 changes: 49 additions & 0 deletions code/subpop.sh
@@ -0,0 +1,49 @@
#!/bin/bash

# Build a list of popular subdomains based on Rapid7's Project Sonar Forward DNS data set
# For background see: https://bitquark.co.uk/blog/2016/02/29/the_most_popular_subdomains_on_the_internet
# DNS data from: https://github.com/rapid7/sonar/wiki/Forward-DNS
# Public suffix data from: https://publicsuffix.org/

# Hello!
echo "Subpop - (c)oded 2015-∞ Jon - bitquark.co.uk"

# Find the date of the most recent DNS data set
DATA_DATE=$(ls -r1 ????????_dnsrecords_all 2>/dev/null | head -1 | cut -d _ -f 1)
if [ -z "$DATA_DATE" ]; then
    echo "[!] No DNS data found. You can download the latest data set from: https://scans.io/study/sonar.fdns"
    exit 1
fi
echo ".oO( Using DNS data from $DATA_DATE )"

# Retrieve suffixes from the publicsuffix.org list
echo ".oO( Updating public suffix list )"
curl -s https://raw.githubusercontent.com/publicsuffix/list/master/public_suffix_list.dat > public_suffix_list.dat

# Speeds things up considerably, and should be fine working with subdomains
# For info, see http://www.inmotionhosting.com/support/website/ssh/speed-up-grep-searches-with-lc-all
export LC_ALL=C

# The domain list contains records of all types (e.g. MX, TXT, CNAME), so let's de-dupe it.
# With the 2015-06-06 data set this gets us from 1,421,085,853 (68G) records down to 523,039,450 (13G)
# There are about 1.6k records that a 'sort -u' would remove, but it's not really worth the extra processing time
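# (each input line is comma-separated with the record name in the first field,
# which is why a plain cut works; the dump is already near-sorted by name, so
# uniq catches almost every duplicate)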
echo ".oO( De-duping DNS records )"
pv $DATA_DATE"_dnsrecords_all" | cut -d , -f 1 | uniq > $DATA_DATE"_domains_with_tld"

# Strip TLDs from domains
# Some TLDs use extended characters, so we unset LC_ALL here
echo ".oO( Stripping TLDs )"
pv $DATA_DATE"_domains_with_tld" | LC_ALL= ./suffix_strip.py | grep '\.' | uniq > $DATA_DATE"_domains_without_tld"

# Take the left-most subdomain from each record and combine with the domain before running uniq to
# make sure that subdomains only get counted once per domain, then trim off the domain part
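# (e.g. www.example and www.mail.example both map to "www:example", so uniq
# collapses them and www is only counted once for the domain example)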
echo ".oO( Extracting subdomains )"
pv $DATA_DATE"_domains_without_tld" | awk -F . '{print $1 ":" $NF}' | uniq | cut -d : -f 1 | sort > $DATA_DATE"_subdomains_raw"

# Do the final tally of popular domains
echo ".oO( Tallying up and sorting by subdomain popularity )"
pv $DATA_DATE"_subdomains_raw" | uniq -c | sort -rn > $DATA_DATE"_subdomains_popular"

# Fin
echo "All done! Popular subdomains are in "$DATA_DATE"_subdomains_popular"

41 changes: 41 additions & 0 deletions code/suffix_strip.py
@@ -0,0 +1,41 @@
#!/usr/bin/env python3

# Strip suffixes from domains using data from the Public Suffix List
# For background see: https://bitquark.co.uk/blog/2016/02/29/the_most_popular_subdomains_on_the_internet
# (c)oded 2015-∞ Jon - bitquark.co.uk

import sys
import os.path
import multiprocessing

def strip_domain(domain):
    """ Strip a domain of its suffix """
    domain = domain.rstrip()
    for suffix in suffixes:
        if domain.endswith(suffix):
            return domain[:-len(suffix)]
    # No suffix matched: fall through and return None, which the
    # grep '\.' stage in subpop.sh filters out downstream

# Check that the public suffix list exists
if not os.path.isfile('public_suffix_list.dat'):
    sys.exit('[!] No suffix list found. You can download the latest copy from: https://publicsuffix.org/list/')

# Build a list of domain suffixes using the public suffix list from publicsuffix.org
# Note that the file is read backwards to prevent .uk superseding .co.uk, for example
with open('public_suffix_list.dat') as fh:
    public_suffixes = [('.' + line.replace('*.', '')) for line in reversed(fh.readlines()) if line[0:2] != '//' and line[0] != '!' and line != '\n']

# Domains with > 400k records in the 2016-02-13 Project Sonar Forward DNS data set and which
# don't supersede sub-TLD parts (e.g. .jp is excluded because of .ne.jp, .co.jp, etc)
common_suffixes = [ '.com', '.net', '.ne.jp', '.de', '.org', '.edu', '.nl', '.info', '.biz', '.co.uk', '.cz', '.dk',
                    '.com.cn', '.mil', '.ac.uk', '.ch', '.eu', '.com.br', '.co.za', '.ad.jp', '.ac.cn', '.com.au',
                    '.or.jp', '.net.au', '.asia', '.ac.jp', '.mobi', '.co.jp', '.sk', '.edu.tw', '.net.pl', '.gov' ]

# Create the suffix list
suffixes = common_suffixes + [_.rstrip() for _ in public_suffixes if _.rstrip() not in common_suffixes]

# Create a multiprocessing pool and iterate over domains, stripping suffixes
# in parallel (the 1024 passed to imap is the per-worker chunk size)
pool = multiprocessing.Pool()
with sys.stdin as fh:
    for domain in pool.imap(strip_domain, fh, 1024):
        print(domain)
