Added option 'reports_directory' to the CLI for saving reports in the JSON format. (#128)

- Optimized some functions;
- Added more type hints and auxiliary types;
- Added a test checking that the man utility is unminimized on the system;
- Removed the duplicated logger and constants from the webparsers package;
- Added pre-commit settings;
- Added an option to save reports as JSON files while finding similar code parts;
- The Docker image is now pushed to Docker Hub only when a new tag is created.
Artanias committed Aug 29, 2022
1 parent a270940 commit b32a542
Showing 30 changed files with 545 additions and 334 deletions.
8 changes: 8 additions & 0 deletions .flake8
@@ -0,0 +1,8 @@
[flake8]
max-complexity = 15

ignore =
# Line break after binary operator
W503, W504
# Line too long
E501,
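W503 and W504 are complementary warnings: one fires when a line breaks before a binary operator, the other when it breaks after, so ignoring both accepts either wrapping style. A small illustration (my own example, not from the repo):

```python
first_value, second_value = 1, 2

# W503 would flag the break *before* the operator:
total = (first_value
         + second_value)

# W504 would flag the break *after* the operator:
total = (first_value +
         second_value)
```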
14 changes: 7 additions & 7 deletions .github/workflows/check_n_push_image.yml
@@ -2,6 +2,8 @@ name: Check source code and push created image based on sources

on:
push:
branches:
- main
paths-ignore:
- 'docs/**'
- '**.md'
@@ -24,12 +26,10 @@ jobs:
with:
python-version: 3.8

- name: Lint with flake8
- name: Lint with flake8 and isort
run: |
pip install flake8 flake8-bugbear flake8-comprehensions mccabe
make substitute-sources
flake8 src/ --statistic --max-line-length=80 --max-complexity 15 -qq
flake8 test/ --statistic --max-line-length=120 --max-complexity 15 -qq
pip install pre-commit==2.20.0
make substitute-sources pre-commit
docker-build-test-autotest:
runs-on: ubuntu-20.04
@@ -65,7 +65,7 @@ jobs:
if-no-files-found: error

- name: Upload created image
if: ${{ github.ref == 'refs/heads/main' }}
if: ${{ startsWith(github.event.ref, 'refs/tags/v') }}
uses: actions/upload-artifact@v3
with:
name: codeplag-ubuntu20.04
@@ -76,7 +76,7 @@
push-image:
runs-on: ubuntu-20.04
needs: [check-code, docker-build-test-autotest]
if: github.ref == 'refs/heads/main'
if: startsWith(github.event.ref, 'refs/tags/v')

steps:
- name: Checkout
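The effect of the two `if` changes is that image upload and push now run only for version tags rather than for every push to main. A minimal sketch of the new gating condition (Python used here for illustration only; the real check is the workflow expression `startsWith(github.event.ref, 'refs/tags/v')`):

```python
def should_push_image(ref: str) -> bool:
    # Mirrors startsWith(github.event.ref, 'refs/tags/v') from the workflow.
    return ref.startswith("refs/tags/v")

assert should_push_image("refs/tags/v0.2.3")     # tag push: image is published
assert not should_push_image("refs/heads/main")  # branch push: it is not
```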
1 change: 0 additions & 1 deletion .gitignore
@@ -30,7 +30,6 @@ debian/copyright

# Substituting
src/codeplag/consts.py
src/webparsers/consts.py
docker/base_ubuntu2004.dockerfile
docker/test_ubuntu2004.dockerfile
docker/ubuntu2004.dockerfile
16 changes: 16 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,16 @@
default_language_version:
python: python3.8
repos:
- repo: https://github.com/PyCQA/isort
rev: 5.10.1
hooks:
- id: isort
- repo: https://github.com/PyCQA/flake8
rev: 5.0.4
hooks:
- id: flake8
additional_dependencies:
- flake8-bugbear==22.8.23
- flake8-comprehensions==3.10.0
- flake8-simplify==0.19.3
- mccabe==0.7.0
13 changes: 6 additions & 7 deletions Makefile
@@ -1,17 +1,15 @@
UTIL_VERSION := 0.2.2
UTIL_VERSION := 0.2.3
UTIL_NAME := codeplag

BASE_DOCKER_TAG := $(shell echo $(UTIL_NAME)-base-ubuntu20.04:$(UTIL_VERSION) | tr A-Z a-z)
TEST_DOCKER_TAG := $(shell echo $(UTIL_NAME)-test-ubuntu20.04:$(UTIL_VERSION) | tr A-Z a-z)
DOCKER_TAG ?= $(shell echo $(UTIL_NAME)-ubuntu20.04:$(UTIL_VERSION) | tr A-Z a-z)

PWD := $(shell pwd)
PYTHONPATH = $(PWD)/src/
PYTHONPATH := $(PWD)/src/
LOGS_PATH := /var/log/codeplag
CODEPLAG_LOG_PATH := $(LOGS_PATH)/$(UTIL_NAME).log
WEBPARSERS_LOG_PATH := $(LOGS_PATH)/webparsers.log
SOURCE_SUB_FILES := src/codeplag/consts.py \
src/webparsers/consts.py
SOURCE_SUB_FILES := src/codeplag/consts.py
DEBIAN_SUB_FILES := debian/changelog \
debian/control \
debian/preinst \
@@ -30,7 +28,6 @@ all: substitute-sources man install
sed \
-e "s|@UTIL_NAME@|${UTIL_NAME}|g" \
-e "s|@UTIL_VERSION@|${UTIL_VERSION}|g" \
-e "s|@WEBPARSERS_LOG_PATH@|${WEBPARSERS_LOG_PATH}|g" \
-e "s|@CODEPLAG_LOG_PATH@|${CODEPLAG_LOG_PATH}|g" \
-e "s|@PYTHON_REQUIRED_LIBS@|${PYTHON_REQUIRED_LIBS}|g" \
-e "s|@LOGS_PATH@|${LOGS_PATH}|g" \
@@ -78,6 +75,9 @@ autotest:
pytest test/auto -q
make clean-cache

pre-commit:
pre-commit run --all-files

clean-cache:
find . -maxdepth 1 -type d | grep -E "pytest_cache" | (xargs rm -r 2> /dev/null || exit 0)
find . -type d | grep -E "__pycache__" | (xargs rm -r 2> /dev/null || exit 0)
@@ -97,7 +97,6 @@ clean: clean-cache

clean-all: clean
rm --force src/codeplag/consts.py
rm --force src/webparsers/consts.py

rm --force docker/base_ubuntu2004.dockerfile
rm --force docker/test_ubuntu2004.dockerfile
2 changes: 1 addition & 1 deletion docker/test_ubuntu2004.dockerfile.in
@@ -3,7 +3,7 @@ ENV DEBIAN_FRONTEND=noninteractive

ADD debian/ /usr/src/@UTIL_NAME@/debian
RUN apt-get install -y debhelper
RUN pip3 install argparse-manpage==3 pytest==7.1.2
RUN pip3 install argparse-manpage==3 pytest==7.1.2 pytest-mock==3.8.2
RUN mkdir -p @LOGS_PATH@

CMD make test
35 changes: 18 additions & 17 deletions docs/notebooks/utils.py
@@ -6,12 +6,13 @@
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from decouple import Config, RepositoryEnv
from scipy.optimize import curve_fit

from codeplag.algorithms.featurebased import counter_metric, struct_compare
from codeplag.algorithms.stringbased import gst
from codeplag.algorithms.tokenbased import value_jakkar_coef
from codeplag.pyplag.utils import get_ast_from_content, get_features_from_ast
from decouple import Config, RepositoryEnv
from scipy.optimize import curve_fit
from webparsers.github_parser import GitHubParser


@@ -74,15 +75,15 @@ def get_time_to_meta(df, iterations=10):
to_meta_time = []
for (index, content) in df[['content', 'link', 'count_lines_without_blank_lines']].iterrows():
print(index, " " * 20, end='\r')
for i in range(iterations):
for _ in range(iterations):
tree = get_ast_from_content(content[0], content[1])
try:
start = perf_counter()
features1 = get_features_from_ast(tree)
get_features_from_ast(tree)
end = perf_counter() - start
to_meta_time.append(end)
count_lines.append(content[2])
except:
except Exception:
break

output = pd.DataFrame(
@@ -115,19 +116,19 @@ def plot_and_save_result(df, xlabel, ylabel, title, what,
if trend == 'linear':
z = np.polyfit(unique_count_lines, mean_times, 1)
p = np.poly1d(z)
plt.plot(unique_count_lines, p(unique_count_lines),"r--", label='Линейный тренд.')
plt.plot(unique_count_lines, p(unique_count_lines), "r--", label='Линейный тренд.')
elif trend == 'n^2':
popt_cons, _ = curve_fit(square_func, unique_count_lines, mean_times, bounds=([-np.inf, 0., 0.], [np.inf, 0.1 ** 100, 0.1 ** 100]))
p = np.poly1d(popt_cons)
plt.plot(unique_count_lines, p(unique_count_lines),"r--", label='Квадратичный тренд.')
plt.plot(unique_count_lines, p(unique_count_lines), "r--", label='Квадратичный тренд.')
elif trend == 'n^3':
popt_cons, _ = curve_fit(cube_func, unique_count_lines, mean_times, bounds=([-np.inf, 0., 0., 0.], [np.inf, 0.1 ** 100, 0.1 ** 100, 0.1 ** 100]))
p = np.poly1d(popt_cons)
plt.plot(unique_count_lines, p(unique_count_lines),"r--", label='Кубический тренд.')
plt.plot(unique_count_lines, p(unique_count_lines), "r--", label='Кубический тренд.')
elif trend == 'n^4':
popt_cons, _ = curve_fit(quart_func, unique_count_lines, mean_times, bounds=([-np.inf, 0., 0., 0., 0.], [np.inf, 0.1 ** 100, 0.1 ** 100, 0.1 ** 100, 0.1 ** 100]))
p = np.poly1d(popt_cons)
plt.plot(unique_count_lines, p(unique_count_lines),"r--", label='n^4.')
plt.plot(unique_count_lines, p(unique_count_lines), "r--", label='n^4.')

rolling = pd.DataFrame(
{
@@ -151,26 +152,26 @@ def get_time_algorithms(df, work, iterations=5, metric='fast'):
tree1 = get_ast_from_content(work.content, work.link)
features1 = get_features_from_ast(tree1)
for (index, content) in df[['content', 'link', 'count_lines_without_blank_lines']].iterrows():
for iteration in range(iterations):
for _ in range(iterations):
print(index, " " * 20, end='\r')
tree2 = get_ast_from_content(content[0], content[1])
try:
features2 = get_features_from_ast(tree2)
except:
except Exception:
continue

if metric == 'fast':
start = perf_counter()
jakkar_coef = value_jakkar_coef(features1.tokens, features2.tokens)
ops_res = counter_metric(features1.operators, features2.operators)
kw_res = counter_metric(features1.keywords, features2.keywords)
lits_res = counter_metric(features1.literals, features2.literals)
end = perf_counter() - start
value_jakkar_coef(features1.tokens, features2.tokens)
counter_metric(features1.operators, features2.operators)
counter_metric(features1.keywords, features2.keywords)
counter_metric(features1.literals, features2.literals)
end = perf_counter() - start
times.append(end)
elif metric == 'gst':
start = perf_counter()
gst(features1.tokens, features2.tokens, 6)
end = perf_counter() - start
end = perf_counter() - start
times.append(end)
elif metric == 'structure':
start = perf_counter()
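The notebook helpers above all follow the same measurement pattern: call `perf_counter()` immediately before and after the work and keep only the elapsed time, which is why this commit drops the unused names (`features1`, `jakkar_coef`, `ops_res`, …) the old code bound inside the timed region. A self-contained sketch of that pattern (`time_call` is a hypothetical helper, not part of the repo):

```python
from time import perf_counter
from typing import Callable, List

def time_call(func: Callable, *args, repeats: int = 5) -> List[float]:
    # Run the call several times and record each wall-clock duration,
    # as get_time_to_meta and get_time_algorithms do per work item.
    durations = []
    for _ in range(repeats):
        start = perf_counter()
        func(*args)
        durations.append(perf_counter() - start)
    return durations

print(time_call(sorted, list(range(10_000))))
```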
4 changes: 1 addition & 3 deletions src/codeplag/algorithms/featurebased.py
@@ -95,12 +95,10 @@ def get_children_indexes(tree: List[Tuple[int, int]],
if count_of_nodes != 0:
current_level = tree[0][0]

current_index = 0
for node in tree:
for current_index, node in enumerate(tree):
if current_level == node[0]:
indexes.append(current_index)
count_of_children += 1
current_index += 1

return indexes, count_of_children

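The refactor above replaces a manually incremented `current_index` with `enumerate`. A simplified, self-contained sketch of the loop's behavior (the real function also takes a `count_of_nodes` argument, omitted here):

```python
from typing import List, Tuple

def get_children_indexes(tree: List[Tuple[int, int]]) -> Tuple[List[int], int]:
    # Sketch of the refactored loop: enumerate() replaces the manual
    # current_index counter removed in this commit.
    indexes: List[int] = []
    count_of_children = 0
    if tree:
        current_level = tree[0][0]
        for current_index, node in enumerate(tree):
            if current_level == node[0]:
                indexes.append(current_index)
                count_of_children += 1
    return indexes, count_of_children

# Nodes are (level, value) pairs; children of the subtree root sit at
# the first level encountered.
print(get_children_indexes([(1, 10), (2, 20), (2, 21), (1, 11)]))
# -> ([0, 3], 2)
```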
18 changes: 11 additions & 7 deletions src/codeplag/algorithms/stringbased.py
@@ -1,3 +1,5 @@
from typing import List

import numpy as np


@@ -20,7 +22,7 @@ def m(symbol1, symbol2):
'''
return 0 if symbol1 == symbol2 else 1

def calculate_distance_matrix(self):
def calculate_distance_matrix(self) -> np.int64:
'''
The function calculates the Levenshtein matrix and sets
in the distance atribute minimal count of operations
@@ -55,7 +57,8 @@ def get_similarity_value(self):
return 1.0 - self.distance / max(self.s1_length, self.s2_length)


def is_marked_match(marked_string_list, begin, length):
def is_marked_match(marked_string_list: List[int],
begin: int, length: int) -> bool:
"""The function returns true if the match consists in
the marked list, else false.
@@ -64,11 +67,12 @@ def is_marked_match(marked_string_list, begin, length):
@length - length of match
"""

if begin in marked_string_list or \
(begin + length - 1) in marked_string_list:
return True
else:
return False
condition = (
begin in marked_string_list or
(begin + length - 1) in marked_string_list
)

return condition


def gst(sequence1, sequence2, min_match_len=6):
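The rewritten `is_marked_match` returns the boolean condition directly instead of branching to `True`/`False`: a match counts as marked when either its first or its last index already belongs to the marked list. A small sketch:

```python
from typing import List

def is_marked_match(marked_string_list: List[int],
                    begin: int, length: int) -> bool:
    # A match is "marked" if either its first or last token index
    # was already claimed by an earlier match.
    return (
        begin in marked_string_list
        or (begin + length - 1) in marked_string_list
    )

# Indexes 3..5 are already marked:
marked = [3, 4, 5]
print(is_marked_match(marked, 5, 2))  # True  - starts on a marked index
print(is_marked_match(marked, 0, 3))  # False - indexes 0 and 2 are free
```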
44 changes: 29 additions & 15 deletions src/codeplag/algorithms/tokenbased.py
@@ -8,38 +8,46 @@


import math
from typing import List, Sequence, Set, Tuple, Union


def generate_ngrams(tokens, n=3, hashit=False, unique=False):
def generate_ngrams(tokens: Sequence[int],
n: int = 3,
hashit: bool = False,
unique: bool = False) -> Union[Set[int],
List[int],
Set[Tuple[int, ...]],
List[Tuple[int, ...]]]:
"""The function returns a list or set of N-grams or list or set of hashes
of ngrams and may use to generate shingles.
@param tokens - list of tokens
@param n - count of elements in sequences
@param n - count of elements in ngrams
@param hashit - If is True,
then the function returns a list or set of hashes of N-grams
@param unique - If is True,
then the function returns a set of N-grams or hashes of N-grams
"""

count_tokens = len(tokens)
if hashit:
if unique:
return {
hash(tuple(tokens[i:i + n]))
for i in range(len(tokens) - n + 1)
for i in range(count_tokens - n + 1)
}
return [
hash(tuple(tokens[i:i + n]))
for i in range(len(tokens) - n + 1)
for i in range(count_tokens - n + 1)
]

if unique:
return {tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)}
return {tuple(tokens[i:i + n]) for i in range(count_tokens - n + 1)}

return [tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]
return [tuple(tokens[i:i + n]) for i in range(count_tokens - n + 1)]


def get_imprints_from_hashes(hashes):
def get_imprints_from_hashes(hashes: Sequence[int]) -> List[int]:
"""The function return imprints of the given hashes
@param hashes - list of hashes
@@ -54,14 +62,20 @@ def get_imprints_from_hashes(hashes):
return [hashes[index] for index in range(0, count_hashes, k)]


def value_jakkar_coef(tokens_first, tokens_second, ngrams_length=3):
def value_jakkar_coef(tokens_first: Sequence[int],
tokens_second: Sequence[int],
ngrams_length: int = 3) -> float:
'''
The function returns the value of the Jakkar coefficient
@param tokens_first - list of tokens of the first program
@param tokens_second - list of tokens of the second program
'''
ngrams_first = generate_ngrams(tokens_first, ngrams_length, unique=True)
ngrams_second = generate_ngrams(tokens_second, ngrams_length, unique=True)
ngrams_first: Set[Tuple[int, ...]] = generate_ngrams(
tokens_first, ngrams_length, unique=True
)
ngrams_second: Set[Tuple[int, ...]] = generate_ngrams(
tokens_second, ngrams_length, unique=True
)

intersection = len(ngrams_first.intersection(ngrams_second))
union = len(ngrams_first | ngrams_second)
@@ -73,7 +87,7 @@ def value_jakkar_coef(tokens_first, tokens_second, ngrams_length=3):


# equal to the Levenshtein length
def lcs(X, Y):
def lcs(X: Sequence[int], Y: Sequence[int]) -> int:
'''
The function returns the length of the longest common subsequence
of two sequences X and Y.
@@ -94,15 +108,15 @@ def lcs(X, Y):
for j in range(n + 1):
if i == 0 or j == 0:
L[i][j] = 0
elif X[i-1] == Y[j-1]:
L[i][j] = L[i-1][j-1] + 1
elif X[i - 1] == Y[j - 1]:
L[i][j] = L[i - 1][j - 1] + 1
else:
L[i][j] = max(L[i-1][j], L[i][j-1])
L[i][j] = max(L[i - 1][j], L[i][j - 1])

return L[m][n]


def lcs_based_coeff(subseq1, subseq2):
def lcs_based_coeff(subseq1: Sequence[int], subseq2: Sequence[int]) -> float:
"""The function returns coefficient based on the length
of the longest common subsequence.
This coefficient describes how same two sequences.
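For intuition about `value_jakkar_coef`: both token streams are turned into unique n-gram sets, and the coefficient is the size of their intersection over the size of their union. A worked example using the same set construction as `generate_ngrams(..., unique=True)`:

```python
from typing import Sequence, Set, Tuple

def ngrams(tokens: Sequence[int], n: int = 3) -> Set[Tuple[int, ...]]:
    # Same construction as generate_ngrams(..., unique=True): the set of
    # all length-n windows over the token sequence.
    return {tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)}

a = ngrams([1, 2, 3, 4, 5])  # {(1,2,3), (2,3,4), (3,4,5)}
b = ngrams([2, 3, 4, 5, 6])  # {(2,3,4), (3,4,5), (4,5,6)}

intersection = len(a & b)    # 2
union = len(a | b)           # 4
print(intersection / union)  # 0.5 - the Jaccard coefficient
```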