Added option 'reports_directory' to the CLI for saving reports in the JSON format. (#128)

- Optimized some functions;
- Added more type hints and auxiliary types;
- Added a test checking that the man utility is unminimized on the system;
- Removed the duplicated logger and constants from the webparsers package;
- Added pre-commit settings;
- Added an option to save reports as JSON files while finding similar code parts;
- The Docker image is now pushed to Docker Hub only when a new tag is created.
Artanias committed Aug 29, 2022
1 parent a270940 commit b32a542
Showing 30 changed files with 545 additions and 334 deletions.
8 changes: 8 additions & 0 deletions .flake8
@@ -0,0 +1,8 @@
[flake8]
max-complexity = 15

ignore =
# Line break after binary operator
W503, W504
# Line too long
E501,
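W503 and W504 are complementary warnings: one fires when a line breaks before a binary operator, the other when it breaks after, so ignoring both accepts either wrapping style. A small illustration (my own example, not from the repo):

```python
first_value, second_value = 1, 2

# W503 would flag the break *before* the operator:
total = (first_value
         + second_value)

# W504 would flag the break *after* the operator:
total = (first_value +
         second_value)
```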
14 changes: 7 additions & 7 deletions .github/workflows/check_n_push_image.yml
@@ -2,6 +2,8 @@ name: Check source code and push created image based on sources

on:
push:
branches:
- main
paths-ignore:
- 'docs/**'
- '**.md'
@@ -24,12 +26,10 @@ jobs:
with:
python-version: 3.8

- name: Lint with flake8
- name: Lint with flake8 and isort
run: |
pip install flake8 flake8-bugbear flake8-comprehensions mccabe
make substitute-sources
flake8 src/ --statistic --max-line-length=80 --max-complexity 15 -qq
flake8 test/ --statistic --max-line-length=120 --max-complexity 15 -qq
pip install pre-commit==2.20.0
make substitute-sources pre-commit
docker-build-test-autotest:
runs-on: ubuntu-20.04
@@ -65,7 +65,7 @@ jobs:
if-no-files-found: error

- name: Upload created image
if: ${{ github.ref == 'refs/heads/main' }}
if: ${{ startsWith(github.event.ref, 'refs/tags/v') }}
uses: actions/upload-artifact@v3
with:
name: codeplag-ubuntu20.04
@@ -76,7 +76,7 @@
push-image:
runs-on: ubuntu-20.04
needs: [check-code, docker-build-test-autotest]
if: github.ref == 'refs/heads/main'
if: startsWith(github.event.ref, 'refs/tags/v')

steps:
- name: Checkout
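The effect of the two `if` changes is that image upload and push now run only for version tags rather than for every push to main. A minimal sketch of the new gating condition (Python used here for illustration only; the real check is the workflow expression `startsWith(github.event.ref, 'refs/tags/v')`):

```python
def should_push_image(ref: str) -> bool:
    # Mirrors startsWith(github.event.ref, 'refs/tags/v') from the workflow.
    return ref.startswith("refs/tags/v")

assert should_push_image("refs/tags/v0.2.3")     # tag push: image is published
assert not should_push_image("refs/heads/main")  # branch push: it is not
```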
1 change: 0 additions & 1 deletion .gitignore
@@ -30,7 +30,6 @@ debian/copyright

# Substituting
src/codeplag/consts.py
src/webparsers/consts.py
docker/base_ubuntu2004.dockerfile
docker/test_ubuntu2004.dockerfile
docker/ubuntu2004.dockerfile
16 changes: 16 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,16 @@
default_language_version:
python: python3.8
repos:
- repo: https://github.com/PyCQA/isort
rev: 5.10.1
hooks:
- id: isort
- repo: https://github.com/PyCQA/flake8
rev: 5.0.4
hooks:
- id: flake8
additional_dependencies:
- flake8-bugbear==22.8.23
- flake8-comprehensions==3.10.0
- flake8-simplify==0.19.3
- mccabe==0.7.0
13 changes: 6 additions & 7 deletions Makefile
@@ -1,17 +1,15 @@
UTIL_VERSION := 0.2.2
UTIL_VERSION := 0.2.3
UTIL_NAME := codeplag

BASE_DOCKER_TAG := $(shell echo $(UTIL_NAME)-base-ubuntu20.04:$(UTIL_VERSION) | tr A-Z a-z)
TEST_DOCKER_TAG := $(shell echo $(UTIL_NAME)-test-ubuntu20.04:$(UTIL_VERSION) | tr A-Z a-z)
DOCKER_TAG ?= $(shell echo $(UTIL_NAME)-ubuntu20.04:$(UTIL_VERSION) | tr A-Z a-z)

PWD := $(shell pwd)
PYTHONPATH = $(PWD)/src/
PYTHONPATH := $(PWD)/src/
LOGS_PATH := /var/log/codeplag
CODEPLAG_LOG_PATH := $(LOGS_PATH)/$(UTIL_NAME).log
WEBPARSERS_LOG_PATH := $(LOGS_PATH)/webparsers.log
SOURCE_SUB_FILES := src/codeplag/consts.py \
src/webparsers/consts.py
SOURCE_SUB_FILES := src/codeplag/consts.py
DEBIAN_SUB_FILES := debian/changelog \
debian/control \
debian/preinst \
@@ -30,7 +28,6 @@ all: substitute-sources man install
sed \
-e "s|@UTIL_NAME@|${UTIL_NAME}|g" \
-e "s|@UTIL_VERSION@|${UTIL_VERSION}|g" \
-e "s|@WEBPARSERS_LOG_PATH@|${WEBPARSERS_LOG_PATH}|g" \
-e "s|@CODEPLAG_LOG_PATH@|${CODEPLAG_LOG_PATH}|g" \
-e "s|@PYTHON_REQUIRED_LIBS@|${PYTHON_REQUIRED_LIBS}|g" \
-e "s|@LOGS_PATH@|${LOGS_PATH}|g" \
@@ -78,6 +75,9 @@ autotest:
pytest test/auto -q
make clean-cache

pre-commit:
pre-commit run --all-files

clean-cache:
find . -maxdepth 1 -type d | grep -E "pytest_cache" | (xargs rm -r 2> /dev/null || exit 0)
find . -type d | grep -E "__pycache__" | (xargs rm -r 2> /dev/null || exit 0)
@@ -97,7 +97,6 @@ clean: clean-cache

clean-all: clean
rm --force src/codeplag/consts.py
rm --force src/webparsers/consts.py

rm --force docker/base_ubuntu2004.dockerfile
rm --force docker/test_ubuntu2004.dockerfile
2 changes: 1 addition & 1 deletion docker/test_ubuntu2004.dockerfile.in
@@ -3,7 +3,7 @@ ENV DEBIAN_FRONTEND=noninteractive

ADD debian/ /usr/src/@UTIL_NAME@/debian
RUN apt-get install -y debhelper
RUN pip3 install argparse-manpage==3 pytest==7.1.2
RUN pip3 install argparse-manpage==3 pytest==7.1.2 pytest-mock==3.8.2
RUN mkdir -p @LOGS_PATH@

CMD make test
35 changes: 18 additions & 17 deletions docs/notebooks/utils.py
@@ -6,12 +6,13 @@
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from decouple import Config, RepositoryEnv
from scipy.optimize import curve_fit

from codeplag.algorithms.featurebased import counter_metric, struct_compare
from codeplag.algorithms.stringbased import gst
from codeplag.algorithms.tokenbased import value_jakkar_coef
from codeplag.pyplag.utils import get_ast_from_content, get_features_from_ast
from decouple import Config, RepositoryEnv
from scipy.optimize import curve_fit
from webparsers.github_parser import GitHubParser


@@ -74,15 +75,15 @@ def get_time_to_meta(df, iterations=10):
to_meta_time = []
for (index, content) in df[['content', 'link', 'count_lines_without_blank_lines']].iterrows():
print(index, " " * 20, end='\r')
for i in range(iterations):
for _ in range(iterations):
tree = get_ast_from_content(content[0], content[1])
try:
start = perf_counter()
features1 = get_features_from_ast(tree)
get_features_from_ast(tree)
end = perf_counter() - start
to_meta_time.append(end)
count_lines.append(content[2])
except:
except Exception:
break

output = pd.DataFrame(
@@ -115,19 +116,19 @@ def plot_and_save_result(df, xlabel, ylabel, title, what,
if trend == 'linear':
z = np.polyfit(unique_count_lines, mean_times, 1)
p = np.poly1d(z)
plt.plot(unique_count_lines, p(unique_count_lines),"r--", label='Линейный тренд.')
plt.plot(unique_count_lines, p(unique_count_lines), "r--", label='Линейный тренд.')
elif trend == 'n^2':
popt_cons, _ = curve_fit(square_func, unique_count_lines, mean_times, bounds=([-np.inf, 0., 0.], [np.inf, 0.1 ** 100, 0.1 ** 100]))
p = np.poly1d(popt_cons)
plt.plot(unique_count_lines, p(unique_count_lines),"r--", label='Квадратичный тренд.')
plt.plot(unique_count_lines, p(unique_count_lines), "r--", label='Квадратичный тренд.')
elif trend == 'n^3':
popt_cons, _ = curve_fit(cube_func, unique_count_lines, mean_times, bounds=([-np.inf, 0., 0., 0.], [np.inf, 0.1 ** 100, 0.1 ** 100, 0.1 ** 100]))
p = np.poly1d(popt_cons)
plt.plot(unique_count_lines, p(unique_count_lines),"r--", label='Кубический тренд.')
plt.plot(unique_count_lines, p(unique_count_lines), "r--", label='Кубический тренд.')
elif trend == 'n^4':
popt_cons, _ = curve_fit(quart_func, unique_count_lines, mean_times, bounds=([-np.inf, 0., 0., 0., 0.], [np.inf, 0.1 ** 100, 0.1 ** 100, 0.1 ** 100, 0.1 ** 100]))
p = np.poly1d(popt_cons)
plt.plot(unique_count_lines, p(unique_count_lines),"r--", label='n^4.')
plt.plot(unique_count_lines, p(unique_count_lines), "r--", label='n^4.')

rolling = pd.DataFrame(
{
@@ -151,26 +152,26 @@ def get_time_algorithms(df, work, iterations=5, metric='fast'):
tree1 = get_ast_from_content(work.content, work.link)
features1 = get_features_from_ast(tree1)
for (index, content) in df[['content', 'link', 'count_lines_without_blank_lines']].iterrows():
for iteration in range(iterations):
for _ in range(iterations):
print(index, " " * 20, end='\r')
tree2 = get_ast_from_content(content[0], content[1])
try:
features2 = get_features_from_ast(tree2)
except:
except Exception:
continue

if metric == 'fast':
start = perf_counter()
jakkar_coef = value_jakkar_coef(features1.tokens, features2.tokens)
ops_res = counter_metric(features1.operators, features2.operators)
kw_res = counter_metric(features1.keywords, features2.keywords)
lits_res = counter_metric(features1.literals, features2.literals)
end = perf_counter() - start
value_jakkar_coef(features1.tokens, features2.tokens)
counter_metric(features1.operators, features2.operators)
counter_metric(features1.keywords, features2.keywords)
counter_metric(features1.literals, features2.literals)
end = perf_counter() - start
times.append(end)
elif metric == 'gst':
start = perf_counter()
gst(features1.tokens, features2.tokens, 6)
end = perf_counter() - start
end = perf_counter() - start
times.append(end)
elif metric == 'structure':
start = perf_counter()
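The notebook helpers above all follow the same measurement pattern: call `perf_counter()` immediately before and after the work and keep only the elapsed time, which is why this commit drops the unused names (`features1`, `jakkar_coef`, `ops_res`, …) the old code bound inside the timed region. A self-contained sketch of that pattern (`time_call` is a hypothetical helper, not part of the repo):

```python
from time import perf_counter
from typing import Callable, List

def time_call(func: Callable, *args, repeats: int = 5) -> List[float]:
    # Run the call several times and record each wall-clock duration,
    # as get_time_to_meta and get_time_algorithms do per work item.
    durations = []
    for _ in range(repeats):
        start = perf_counter()
        func(*args)
        durations.append(perf_counter() - start)
    return durations

print(time_call(sorted, list(range(10_000))))
```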
4 changes: 1 addition & 3 deletions src/codeplag/algorithms/featurebased.py
@@ -95,12 +95,10 @@ def get_children_indexes(tree: List[Tuple[int, int]],
if count_of_nodes != 0:
current_level = tree[0][0]

current_index = 0
for node in tree:
for current_index, node in enumerate(tree):
if current_level == node[0]:
indexes.append(current_index)
count_of_children += 1
current_index += 1

return indexes, count_of_children

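The refactor above replaces a manually incremented `current_index` with `enumerate`. A simplified, self-contained sketch of the loop's behavior (the real function also takes a `count_of_nodes` argument, omitted here):

```python
from typing import List, Tuple

def get_children_indexes(tree: List[Tuple[int, int]]) -> Tuple[List[int], int]:
    # Sketch of the refactored loop: enumerate() replaces the manual
    # current_index counter removed in this commit.
    indexes: List[int] = []
    count_of_children = 0
    if tree:
        current_level = tree[0][0]
        for current_index, node in enumerate(tree):
            if current_level == node[0]:
                indexes.append(current_index)
                count_of_children += 1
    return indexes, count_of_children

# Nodes are (level, value) pairs; children of the subtree root sit at
# the first level encountered.
print(get_children_indexes([(1, 10), (2, 20), (2, 21), (1, 11)]))
# -> ([0, 3], 2)
```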
18 changes: 11 additions & 7 deletions src/codeplag/algorithms/stringbased.py
@@ -1,3 +1,5 @@
from typing import List

import numpy as np


@@ -20,7 +22,7 @@ def m(symbol1, symbol2):
'''
return 0 if symbol1 == symbol2 else 1

def calculate_distance_matrix(self):
def calculate_distance_matrix(self) -> np.int64:
'''
The function calculates the Levenshtein matrix and sets
in the distance atribute minimal count of operations
@@ -55,7 +57,8 @@ def get_similarity_value(self):
return 1.0 - self.distance / max(self.s1_length, self.s2_length)


def is_marked_match(marked_string_list, begin, length):
def is_marked_match(marked_string_list: List[int],
begin: int, length: int) -> bool:
"""The function returns true if the match consists in
the marked list, else false.
@@ -64,11 +67,12 @@ def is_marked_match(marked_string_list, begin, length):
@length - length of match
"""

if begin in marked_string_list or \
(begin + length - 1) in marked_string_list:
return True
else:
return False
condition = (
begin in marked_string_list or
(begin + length - 1) in marked_string_list
)

return condition


def gst(sequence1, sequence2, min_match_len=6):
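The rewritten `is_marked_match` returns the boolean condition directly instead of branching to `True`/`False`: a match counts as marked when either its first or its last index already belongs to the marked list. A small sketch:

```python
from typing import List

def is_marked_match(marked_string_list: List[int],
                    begin: int, length: int) -> bool:
    # A match is "marked" if either its first or last token index
    # was already claimed by an earlier match.
    return (
        begin in marked_string_list
        or (begin + length - 1) in marked_string_list
    )

# Indexes 3..5 are already marked:
marked = [3, 4, 5]
print(is_marked_match(marked, 5, 2))  # True  - starts on a marked index
print(is_marked_match(marked, 0, 3))  # False - indexes 0 and 2 are free
```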
44 changes: 29 additions & 15 deletions src/codeplag/algorithms/tokenbased.py
@@ -8,38 +8,46 @@


import math
from typing import List, Sequence, Set, Tuple, Union


def generate_ngrams(tokens, n=3, hashit=False, unique=False):
def generate_ngrams(tokens: Sequence[int],
n: int = 3,
hashit: bool = False,
unique: bool = False) -> Union[Set[int],
List[int],
Set[Tuple[int, ...]],
List[Tuple[int, ...]]]:
"""The function returns a list or set of N-grams or list or set of hashes
of ngrams and may use to generate shingles.
@param tokens - list of tokens
@param n - count of elements in sequences
@param n - count of elements in ngrams
@param hashit - If is True,
then the function returns a list or set of hashes of N-grams
@param unique - If is True,
then the function returns a set of N-grams or hashes of N-grams
"""

count_tokens = len(tokens)
if hashit:
if unique:
return {
hash(tuple(tokens[i:i + n]))
for i in range(len(tokens) - n + 1)
for i in range(count_tokens - n + 1)
}
return [
hash(tuple(tokens[i:i + n]))
for i in range(len(tokens) - n + 1)
for i in range(count_tokens - n + 1)
]

if unique:
return {tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)}
return {tuple(tokens[i:i + n]) for i in range(count_tokens - n + 1)}

return [tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]
return [tuple(tokens[i:i + n]) for i in range(count_tokens - n + 1)]


def get_imprints_from_hashes(hashes):
def get_imprints_from_hashes(hashes: Sequence[int]) -> List[int]:
"""The function return imprints of the given hashes
@param hashes - list of hashes
@@ -54,14 +62,20 @@ def get_imprints_from_hashes(hashes):
return [hashes[index] for index in range(0, count_hashes, k)]


def value_jakkar_coef(tokens_first, tokens_second, ngrams_length=3):
def value_jakkar_coef(tokens_first: Sequence[int],
tokens_second: Sequence[int],
ngrams_length: int = 3) -> float:
'''
The function returns the value of the Jakkar coefficient
@param tokens_first - list of tokens of the first program
@param tokens_second - list of tokens of the second program
'''
ngrams_first = generate_ngrams(tokens_first, ngrams_length, unique=True)
ngrams_second = generate_ngrams(tokens_second, ngrams_length, unique=True)
ngrams_first: Set[Tuple[int, ...]] = generate_ngrams(
tokens_first, ngrams_length, unique=True
)
ngrams_second: Set[Tuple[int, ...]] = generate_ngrams(
tokens_second, ngrams_length, unique=True
)

intersection = len(ngrams_first.intersection(ngrams_second))
union = len(ngrams_first | ngrams_second)
@@ -73,7 +87,7 @@ def value_jakkar_coef(tokens_first, tokens_second, ngrams_length=3):


# equal to the Levenshtein length
def lcs(X, Y):
def lcs(X: Sequence[int], Y: Sequence[int]) -> int:
'''
The function returns the length of the longest common subsequence
of two sequences X and Y.
@@ -94,15 +108,15 @@ def lcs(X, Y):
for j in range(n + 1):
if i == 0 or j == 0:
L[i][j] = 0
elif X[i-1] == Y[j-1]:
L[i][j] = L[i-1][j-1] + 1
elif X[i - 1] == Y[j - 1]:
L[i][j] = L[i - 1][j - 1] + 1
else:
L[i][j] = max(L[i-1][j], L[i][j-1])
L[i][j] = max(L[i - 1][j], L[i][j - 1])

return L[m][n]


def lcs_based_coeff(subseq1, subseq2):
def lcs_based_coeff(subseq1: Sequence[int], subseq2: Sequence[int]) -> float:
"""The function returns coefficient based on the length
of the longest common subsequence.
This coefficient describes how same two sequences.
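For intuition about `value_jakkar_coef`: both token streams are turned into unique n-gram sets, and the coefficient is the size of their intersection over the size of their union. A worked example using the same set construction as `generate_ngrams(..., unique=True)`:

```python
from typing import Sequence, Set, Tuple

def ngrams(tokens: Sequence[int], n: int = 3) -> Set[Tuple[int, ...]]:
    # Same construction as generate_ngrams(..., unique=True): the set of
    # all length-n windows over the token sequence.
    return {tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)}

a = ngrams([1, 2, 3, 4, 5])  # {(1,2,3), (2,3,4), (3,4,5)}
b = ngrams([2, 3, 4, 5, 6])  # {(2,3,4), (3,4,5), (4,5,6)}

intersection = len(a & b)    # 2
union = len(a | b)           # 4
print(intersection / union)  # 0.5 - the Jaccard coefficient
```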