+ make the installation process easier (attending #16 request)

+ starting to implement better memory management (cached wordlists writing and reading i/o files), not working yet (related to #12 issue) + updating and fixing minor bugs related to dependencies (maybe related to #14 issue) + 'exclude from other wordlists' feature removed
r3nt0n · Jul 26, 2022 · 7accca3 · 7accca3
1 parent 34fb3fd
commit 7accca3
Show file tree

Hide file tree

Showing 7 changed files with 164 additions and 60 deletions.
diff --git a/README.md b/README.md
@@ -3,7 +3,7 @@
 [![Packaging status](https://repology.org/badge/tiny-repos/bopscrk.svg)](https://repology.org/project/bopscrk/versions)
 ![[GPL-3.0 License](https://github.com/r3nt0n)](https://img.shields.io/badge/license-GPL%203.0-brightgreen.svg)
 ![[Python 3.8](https://github.com/r3nt0n)](http://img.shields.io/badge/python-3.8-blue.svg)
-![[Version 2.3.1](https://github.com/r3nt0n)](http://img.shields.io/badge/version-2.3.1-orange.svg)
+![[Version 2.4](https://github.com/r3nt0n)](http://img.shields.io/badge/version-2.4-orange.svg)
 
 
 
@@ -24,8 +24,18 @@ Included in **<a href="https://blackarch.org/">BlackArch Linux</a>** pentesting
 
 ## Requirements
 + **Python 3** (secondary branch keeps Python 2.7 legacy support)
-+ *optional* - to use `lyricpass` module:   
-    `pip install requirements.txt`
++ `requests` Python module (installed from `requirements.txt`, see below)
+
+## Get started
+### Download and install
+```
+git clone --recurse-submodules https://github.com/r3nt0n/bopscrk
+cd bopscrk
+pip install -r requirements.txt
+```
+### Run interactive mode
+```
+python3 bopscrk.py -i
+```
 
 ## Usage
 ```
@@ -39,8 +49,6 @@ Included in **<a href="https://blackarch.org/">BlackArch Linux</a>** pentesting
   -l, --leet         enable leet transformations
   -n                 max amount of words to combine each time (default: 2)
   -a , --artists     artists to search song lyrics (comma-separated)
-  -x , --exclude     exclude all the words included in other wordlists
-                     (several wordlists should be comma-separated)
   -o , --output      output file to save the wordlist (default: tmp.txt)
   -C , --config      specify config file to use (default: ./bopscrk.cfg)
 
@@ -100,10 +108,12 @@ It will retrieve all lyrics from all songs which belongs to artists that you pro
     `leet_charset=a:4 e:3`      
 
 
-#### Weighted-words system
-[...] Coming soon [...]
-
 ## Changelist
++ `2.4 version notes (26/7/2022)`
+  + make the installation process easier
+  + starting to implement better memory management (cached wordlists writing and reading i/o files), not working yet
+  + updating and fixing minor bugs related to dependencies
+  + 'exclude from other wordlists' feature removed 
 + `2.3.1 version notes`
   + fixing namespace bug (related to aux.py module, renamed to auxiliars.py) when running on windows systems
   + **unittest** (and simple unitary tests for transforms, excluders and combinators functions) **implemented**.
@@ -134,7 +144,6 @@ It will retrieve all lyrics from all songs which belongs to artists that you pro
 
 
 ## TO-DO list
-+ Implement **weighted-words system**.
 + Create options to **custom case transforms** (e.g.: disable pair/odd transforms).
 + **Lyricpass** integration was upgraded to last version released by initstring, but still needs some tweaks to speed up the search process (I would appreciate any help).
 

diff --git a/bopscrk.py b/bopscrk.py
@@ -9,7 +9,7 @@
 
 name =  'bopscrk.py'
 __author__ = 'r3nt0n'
-__version__ = '2.3.1'
+__version__ = '2.4'
 __status__ = 'Development'
 
 

diff --git a/modules/args.py b/modules/args.py
@@ -41,10 +41,10 @@ def __init__(self):
                             dest='artists', default=False,
                             help='artists to search song lyrics (comma-separated)')
 
-        parser.add_argument('-x', '--exclude', action="store", metavar='', type=str,
-                            dest='exclude', default=False,
-                            help='exclude all the words included in other wordlists '
-                                 '(several wordlists should be comma-separated)')
+        # parser.add_argument('-x', '--exclude', action="store", metavar='', type=str,
+        #                     dest='exclude', default=False,
+        #                     help='exclude all the words included in other wordlists '
+        #                          '(several wordlists should be comma-separated)')
 
         parser.add_argument('-o', '--output', action="store", metavar='', type=str,
                             dest='outfile', default=self.DEFAULT_OUTPUT_FILE,
@@ -96,6 +96,12 @@ def set_interactive_options(self):
             else:
                 break
 
+        self.artists = input('  {}[?]{} Artist names to search song lyrics (comma-separated) >>> '.format(color.BLUE, color.END))
+        if is_empty(self.artists):
+            self.artists = False
+        else:
+            self.artists = self.artists.split(',')
+
         others = input('  {}[?]{} Some other relevant words (comma-separated) >>> '.format(color.BLUE, color.END))
 
         leet = input('  {}[?]{} Do yo want to make leet transforms? [y/n] >>> '.format(color.BLUE, color.END))
@@ -122,24 +128,20 @@ def set_interactive_options(self):
                 except ValueError:
                     print('  {}[!]{} Should be an integer'.format(color.RED, color.END))
 
-        self.artists = input('  {}[?]{} Artist names to search song lyrics (comma-separated) >>> '.format(color.BLUE, color.END))
-        if is_empty(self.artists): self.artists = False
-        else: self.artists = self.artists.split(',')
-
-        while True:
-            exclude = input('  {}[?]{} Exclude words from other wordlists? >>> '.format(color.BLUE, color.END))
-            if is_empty(exclude):
-                self.exclude_wordlists = False; break
-            else:
-                exclude = exclude.split(',')
-                valid_paths = True
-                for wl_path in exclude:
-                    if not os.path.isfile(wl_path):
-                        valid_paths = False
-                        print('  {}[!]{} {} not found'.format(color.RED, color.END, wl_path))
-                if valid_paths:
-                    self.exclude_wordlists = exclude
-                    break
+        # while True:
+        #     exclude = input('  {}[?]{} Exclude words from other wordlists? >>> '.format(color.BLUE, color.END))
+        #     if is_empty(exclude):
+        #         self.exclude_wordlists = False; break
+        #     else:
+        #         exclude = exclude.split(',')
+        #         valid_paths = True
+        #         for wl_path in exclude:
+        #             if not os.path.isfile(wl_path):
+        #                 valid_paths = False
+        #                 print('  {}[!]{} {} not found'.format(color.RED, color.END, wl_path))
+        #         if valid_paths:
+        #             self.exclude_wordlists = exclude
+        #             break
 
         self.outfile = input('  {}[?]{} Output file [{}] >>> '.format(color.BLUE, color.END, self.DEFAULT_OUTPUT_FILE))
         if is_empty(self.outfile): self.outfile = self.DEFAULT_OUTPUT_FILE
@@ -178,12 +180,12 @@ def set_cli_options(self):
         self.n_words = self.args.n_words
         self.artists = self.args.artists
         self.outfile = self.args.outfile
-        self.exclude_wordlists = self.args.exclude
-        if self.exclude_wordlists:
-            self.exclude_wordlists = self.exclude_wordlists.split(',')
-            for wl_path in self.exclude_wordlists:
-                if not os.path.isfile(wl_path):
-                    print('  {}[!]{} {} not found'.format(color.RED, color.END, wl_path))
-                    sys.exit(4)
+        # self.exclude_wordlists = self.args.exclude
+        # if self.exclude_wordlists:
+        #     self.exclude_wordlists = self.exclude_wordlists.split(',')
+        #     for wl_path in self.exclude_wordlists:
+        #         if not os.path.isfile(wl_path):
+        #             print('  {}[!]{} {} not found'.format(color.RED, color.END, wl_path))
+        #             sys.exit(4)
         if self.artists:
             self.artists = self.artists.split(',')
diff --git a/modules/auxiliars.py b/modules/auxiliars.py
@@ -29,4 +29,33 @@ def is_valid_date(date_str):
         datetime.datetime.strptime(date_str, '%d/%m/%Y')
         return True
     except ValueError:
-        return False
+        return False
+
+def append_wordlist_to_file(filepath, wordlist):
+    """
+    Append every word in wordlist to filepath, one word per line.
+
+    The file is opened in append mode, so it is created if it does not exist
+    and any existing content is kept. The platform default text encoding is used.
+
+    :param filepath: path to file
+    :param wordlist: iterable of words (str) to save
+    :return: True if every word was written, False if the file could not be
+             opened or written, or a word could not be encoded
+    """
+    try:
+        with open(filepath, 'a') as f:
+            f.writelines(word + '\n' for word in wordlist)
+    except (OSError, UnicodeError):
+        # Only I/O and encoding failures are reported as False; programming
+        # errors and KeyboardInterrupt are deliberately not swallowed.
+        return False
+    return True
+
+
+def remove_duplicates_from_file(infile_path, outfile_path="temp.000000000.bopscrk"):
+    """
+    Remove duplicated lines from infile_path, keeping the first occurrence of
+    each line and the original order.
+
+    The unique lines are written to outfile_path, which then replaces
+    infile_path, so outfile_path does not exist anymore when the function ends.
+    Every line already seen is kept in memory while the file is processed.
+
+    :param infile_path: path to the file to clean (it is overwritten)
+    :param outfile_path: path to the temporary file used while filtering
+    :return: None
+    """
+    lines_seen = set()  # holds lines already seen (without the line ending)
+    # 'with' closes both files even if reading or writing fails midway
+    with open(infile_path, "r") as infile, open(outfile_path, "w") as outfile:
+        for line in infile:
+            # Compare without the line ending, so a last line with no trailing
+            # newline is still detected as a duplicate of an earlier line
+            word = line.rstrip('\n')
+            if word not in lines_seen:  # not a duplicate
+                lines_seen.add(word)
+                outfile.write(word + '\n')
+    # os.replace overwrites the destination in one step on every platform, so
+    # there is no moment where infile_path is missing (os.remove + os.rename)
+    os.replace(outfile_path, infile_path)
diff --git a/modules/main.py b/modules/main.py
@@ -6,10 +6,10 @@
 import sys, os, datetime
 
 from bopscrk import name, __version__, __author__, args, Config
-from modules.auxiliars import clear
+from modules.auxiliars import clear, remove_duplicates_from_file
 from modules import banners
 from modules.color import color
-from modules.transforms import leet_transforms, case_transforms, artist_space_transforms, lyric_space_transforms, multithread_transforms, take_initials
+from modules.transforms import leet_transforms, case_transforms, artist_space_transforms, lyric_space_transforms, multithread_transforms, take_initials, transform_cached_wordlist_and_save
 from modules.combinators import combinator, add_common_separators
 from modules.excluders import remove_by_lengths, remove_duplicates, multithread_exclude
 
@@ -59,7 +59,7 @@ def run():
                 # Search lyrics if it meets dependencies for lyricpass
                 try:
                     from modules.lyricpass import lyricpass
-                    print('\n{}     -- Starting lyricpass module (by initstring) --\n'.format(color.GREY))
+                    print('\n{}     -- Starting lyricpass module --\n'.format(color.GREY))
                     print('  {}[*]{} Looking for {}\'s lyrics...'.format(color.CYAN, color.END, artist.title()))
                     lyrics = lyricpass.lyricpass(artist)
                     #lyrics = [s.decode("utf-8") for s in lyfinder.lyrics]
@@ -73,7 +73,7 @@ def run():
 
                     # Add the phrases to BASE wordlist
                     lyrics = remove_by_lengths(lyrics, args.min_length, args.max_length)
-                    print('  {}[+]{} Removing by min and max length range ({} phrases remain)...'.format(color.BLUE, color.END,len(lyrics)))
+                    print('  {}[+]{} Adding raw phrases filtering by min and max length range ({} phrases remain)...'.format(color.BLUE, color.END,len(lyrics)))
                     final_wordlist += lyrics
 
                     # Take just the initials on each phrase and add as a new word to FINAL wordlist
@@ -121,6 +121,24 @@ def run():
         # (!) Check for duplicates (is checked before return in combinator() and add_common_separators())
         #final_wordlist = remove_duplicates(final_wordlist)
 
+
+        # # CASE TRANSFORMS
+        # if args.case:
+        #     print('  {}[+]{} Applying case transforms to {} words...'.format(color.BLUE, color.END, len(final_wordlist)))
+        #
+        #     # transform_cached_wordlist_and_save(case_transforms, args.outfile) # not working yet, infinite loop ?¿?¿
+        #     temp_wordlist = []
+        #     temp_wordlist += multithread_transforms(case_transforms, final_wordlist)
+        #     final_wordlist += temp_wordlist
+        #
+        # final_wordlist = remove_duplicates(final_wordlist)
+        #
+        # # SAVE WORDLIST TO FILE BEFORE LEET TRANSFORMS
+        # ############################################################################
+        # with open(args.outfile, 'w') as f:
+        #     for word in final_wordlist:
+        #         f.write(word + '\n')
+
         # LEET TRANSFORMS
         if args.leet:
             if not Config.LEET_CHARSET:
@@ -135,36 +153,44 @@ def run():
                           '      could take several minutes{}\n'.format(color.ORANGE,color.END,args.max_length,color.ORANGE,color.END,len(final_wordlist),color.ORANGE,color.END))
                     recursive_msg = '{}recursive{} '.format(color.RED,color.END)
                 print('  {}[+]{} Applying {}leet transforms to {} words...'.format(color.BLUE, color.END, recursive_msg,len(final_wordlist)))
-                #print(final_wordlist)
+
+                #transform_cached_wordlist_and_save(leet_transforms, args.outfile)
+                #remove_duplicates_from_file(args.outfile)
+
                 temp_wordlist = []
                 temp_wordlist += multithread_transforms(leet_transforms, final_wordlist)
                 final_wordlist += temp_wordlist
 
         # CASE TRANSFORMS
         if args.case:
-            print('  {}[+]{} Applying case transforms to {} words...'.format(color.BLUE, color.END, len(final_wordlist)))
+            print('  {}[+]{} Applying case transforms to {} words...'.format(color.BLUE, color.END,len(final_wordlist)))
+
+            # transform_cached_wordlist_and_save(case_transforms, args.outfile) # not working yet, infinite loop ?¿?¿
+
             temp_wordlist = []
             temp_wordlist += multithread_transforms(case_transforms, final_wordlist)
             final_wordlist += temp_wordlist
 
+        final_wordlist = remove_duplicates(final_wordlist)
+
         # EXCLUDE FROM OTHER WORDLISTS
-        if args.exclude_wordlists:
+        #if args.exclude_wordlists:
             # For each path to wordlist provided
-            for wl_path in args.exclude_wordlists:
-                print('  {}[+]{} Excluding wordlist {}...'.format(color.BLUE, color.END, os.path.basename(wl_path)))
-                # Open the file
-                with open(wl_path, 'r') as x_wordlist_file:
-                    # Read line by line in a loop
-                    while True:
-                        word_to_exclude = x_wordlist_file.readline()
-                        if not word_to_exclude: break  # breaks the loop when file ends
-                        final_wordlist = multithread_exclude(word_to_exclude, final_wordlist)
+            # for wl_path in args.exclude_wordlists:
+            #     print('  {}[+]{} Excluding wordlist {}...'.format(color.BLUE, color.END, os.path.basename(wl_path)))
+            #     # Open the file
+            #     with open(wl_path, 'r') as x_wordlist_file:
+            #         # Read line by line in a loop
+            #         while True:
+            #             word_to_exclude = x_wordlist_file.readline()
+            #             if not word_to_exclude: break  # breaks the loop when file ends
+            #             final_wordlist = multithread_exclude(word_to_exclude, final_wordlist)
 
         # re-check for duplicates
-        final_wordlist = remove_duplicates(final_wordlist)
+        #final_wordlist = remove_duplicates(final_wordlist)
 
         # SAVE WORDLIST TO FILE
-        ############################################################################
+        ###########################################################################
         with open(args.outfile, 'w') as f:
             for word in final_wordlist:
                 f.write(word + '\n')
@@ -178,7 +204,8 @@ def run():
         ############################################################################
         print('\n  {}[+]{} Time elapsed:\t{}'.format(color.GREEN, color.END, total_time))
         print('  {}[+]{} Output file:\t{}{}{}{}'.format(color.GREEN, color.END, color.BOLD, color.BLUE, args.outfile, color.END))
-        print('  {}[+]{} Words generated:\t{}{}{}\n'.format(color.GREEN, color.END, color.RED, len(final_wordlist), color.END))
+        #print('  {}[+]{} Words generated:\t{}{}{}\n'.format(color.GREEN, color.END, color.RED, str(sum(1 for line in open(args.outfile))), color.END))
+        print('  {}[+]{} Words generated:\t{}{}{}\n'.format(color.GREEN, color.END, color.RED,len(final_wordlist), color.END))
         sys.exit(0)
 
     except KeyboardInterrupt:

diff --git a/modules/transforms.py b/modules/transforms.py
@@ -7,6 +7,7 @@
 
 from bopscrk import Config
 from modules.excluders import remove_duplicates
+from modules.auxiliars import append_wordlist_to_file
 
 
 def case_transforms(word):
@@ -143,4 +144,40 @@ def multithread_transforms(transform_type, wordlist):
     return new_wordlist
 
 
+def transform_cached_wordlist_and_save(transform_type, filepath, chunk_size=8000):
+    """
+    Apply transform_type to the words stored in filepath, working in chunks, and
+    append the transformed words to the end of the same file.
+
+    Only the words present in the file when the function is called are
+    transformed: the words appended by this function are never read back.
+    Reading them back feeds every pass with the output of the previous one, so
+    the file keeps growing and the end of the file is never reached.
+
+    :param transform_type: transform function handed to multithread_transforms()
+    :param filepath: path to the wordlist file (read as UTF-8, one word per line)
+    :param chunk_size: max amount of lines loaded in memory on each pass
+    :return: None
+    """
+    # Size in bytes of the original content. The file is read in binary mode so
+    # offsets are plain byte counts that can be compared with this size.
+    with open(filepath, 'rb') as f:
+        end_of_input = f.seek(0, 2)  # whence=2: position relative to file end
+
+    position = 0
+    while position < end_of_input:
+
+        cached_wordlist = []
+        seen = set()  # constant-time duplicate check inside the current chunk
+        remaining = end_of_input - position
+        lines_read = 0
+
+        with open(filepath, 'rb') as f:
+            f.seek(position)  # put pointer on last position
+            while lines_read < chunk_size and remaining > 0:
+                # Never read past the original content, even if its last line
+                # has no trailing newline and new words were appended after it
+                raw_line = f.readline(remaining)
+                if not raw_line:
+                    break
+                remaining -= len(raw_line)
+                lines_read += 1
+                word = raw_line.decode('utf-8').strip()
+                if word not in seen:
+                    seen.add(word)
+                    cached_wordlist.append(word)
+
+        if lines_read == 0:
+            # No progress (e.g. chunk_size <= 0 or the file was truncated):
+            # stop instead of looping forever
+            break
+        position = end_of_input - remaining  # save last position
+
+        new_wordlist = multithread_transforms(transform_type, cached_wordlist)
+        if not append_wordlist_to_file(filepath, new_wordlist):
+            # The file can not be written, further passes would be wasted work
+            break
+
+
 
diff --git a/tests/bopscrk.cfg b/tests/bopscrk.cfg
@@ -27,7 +27,7 @@ separators_strings=123 xXx !!
 # LEET REPLACEMENT CHARSET
 # characters to replace and correspondent substitute in leet transforms
 leet_charset=a:4 e:3 i:1 o:0 s:$ t:7
-# Uncomment the following line to get an extensive charset
+# Uncomment the following line to get an extensive (and expensive) charset
 # leet_charset=a:4 a:@ e:3 i:1 i:! i:¡ l:1 o:0 s:$ s:5 b:8 t:7 c:(
 
 # RECURSIVE LEET TRANSFORMS - Enables a recursive call to leet_transforms() function