Skip to content

Commit

Permalink
refactor: adds parallelizing of the work comparison process
Browse files Browse the repository at this point in the history
- Adds parallelizing of the work comparison process, as well as expanded code documentation;
- The progress indication for many workers was implemented;
- Shows in progress time spent, predicted time left, and count of workers;
- Adds the possibility of changing the number of workers.

Refs: #183
  • Loading branch information
Artanias authored Mar 31, 2024
1 parent 7a2d9a9 commit 4c0d89b
Show file tree
Hide file tree
Showing 19 changed files with 525 additions and 195 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

# Development environment
.vscode
.python-version
docs/notebooks/.ipynb_checkpoints/

# Build
Expand Down
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ repos:
hooks:
- id: black
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.0.287
rev: v0.3.4
hooks:
- id: ruff
- repo: local
Expand Down
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
UTIL_VERSION := 0.4.2
UTIL_VERSION := 0.4.3
UTIL_NAME := codeplag
PWD := $(shell pwd)

Expand Down
1 change: 0 additions & 1 deletion src/codeplag/algorithms/tokenbased.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
of two token sequences.
"""


import math
from typing import List, Literal, Sequence, Set, Tuple, Union, overload

Expand Down
9 changes: 9 additions & 0 deletions src/codeplag/codeplagcli.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
This module contains the CLI of the codeplag util and
the internal classes it requires.
"""

import argparse
from pathlib import Path
from typing import List, Optional
Expand All @@ -16,6 +17,7 @@
REPORTS_EXTENSION_CHOICE,
UTIL_NAME,
UTIL_VERSION,
WORKERS_CHOICE,
)


Expand Down Expand Up @@ -134,6 +136,13 @@ def __add_settings_path(self, subparsers: argparse._SubParsersAction) -> None:
type=str,
choices=LANGUAGE_CHOICE,
)
settings_modify.add_argument(
"-w",
"--workers",
help="The maximum number of processes that can be used to compare works.",
type=int,
choices=WORKERS_CHOICE,
)

# settings show
settings_commands.add_parser(
Expand Down
2 changes: 2 additions & 0 deletions src/codeplag/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
DEFAULT_LANGUAGE,
DEFAULT_REPORT_EXTENSION,
DEFAULT_THRESHOLD,
DEFAULT_WORKERS,
)
from codeplag.logger import codeplag_logger as logger
from codeplag.types import Settings
Expand Down Expand Up @@ -85,4 +86,5 @@ def write_settings_conf(settings: Settings) -> None:
show_progress=0,
reports_extension=DEFAULT_REPORT_EXTENSION,
language=DEFAULT_LANGUAGE,
workers=DEFAULT_WORKERS,
)
20 changes: 15 additions & 5 deletions src/codeplag/consts.tmp.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import os
import re
from pathlib import Path
from typing import Dict, Final, List, Tuple
from typing import Dict, Final, List, Tuple, get_args

from codeplag.types import (
Extension,
Expand All @@ -22,13 +23,16 @@
"ru": Path("@LIB_PATH@/report_ru.templ"),
"en": Path("@LIB_PATH@/report_en.templ"),
}
# =====

# Default values
DEFAULT_THRESHOLD: Final[Threshold] = 65
DEFAULT_WEIGHTS: Final[Tuple[float, float, float, float]] = (1.0, 0.4, 0.4, 0.4)
DEFAULT_LANGUAGE: Final[Language] = "en"
DEFAULT_REPORT_EXTENSION: Final[ReportsExtension] = "csv"
DEFAULT_GENERAL_REPORT_NAME: Final[str] = "report.html"
DEFAULT_WORKERS: Final[int] = os.cpu_count() or 1
# =============

GET_FRAZE: Final[str] = "Getting works features from"

Expand All @@ -52,10 +56,16 @@
"compliance_matrix",
)

MODE_CHOICE: Final[List[Mode]] = ["many_to_many", "one_to_one"]
REPORTS_EXTENSION_CHOICE: Final[List[ReportsExtension]] = ["csv", "json"]
EXTENSION_CHOICE: Final[List[Extension]] = ["py", "cpp"]
LANGUAGE_CHOICE: Final[List[Language]] = ["en", "ru"]
# Choices
MODE_CHOICE: Final[Tuple[Mode, ...]] = get_args(Mode)
REPORTS_EXTENSION_CHOICE: Final[Tuple[ReportsExtension, ...]] = get_args(
ReportsExtension
)
EXTENSION_CHOICE: Final[Tuple[Extension, ...]] = get_args(Extension)
LANGUAGE_CHOICE: Final[Tuple[Language, ...]] = get_args(Language)
WORKERS_CHOICE: Final[List[int]] = list(range(1, DEFAULT_WORKERS + 1))
# =======

ALL_EXTENSIONS: Final[Tuple[re.Pattern]] = (re.compile(r"\..*$"),)
# NOTE: Final does not prevent changing the values by key
SUPPORTED_EXTENSIONS: Final[Dict[Extension, Extensions]] = {
Expand Down
91 changes: 82 additions & 9 deletions src/codeplag/display.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@
from enum import Enum
from functools import partial
from typing import List, Optional
from typing import Final, List, Optional

import pandas as pd
from typing_extensions import Self

from codeplag.types import ASTFeatures, CompareInfo, NodeCodePlace

# Width, in characters, of the "+" separator lines printed around a compare result.
CHARS_CNT: Final[int] = 40
# Number of spaces `clear_line` prints to blank out the current terminal line.
USEFUL_CHARS: Final[int] = 100


class Color(Enum):
HEADER = "\033[95m"
Expand All @@ -22,6 +26,7 @@ class Color(Enum):
def colorize(
text: str, color: Color, bold: bool = False, underline: bool = False
) -> str:
"""Wraps provided text to change color, bold, or underline it for printing."""
if bold:
text = f"{Color.BOLD.value}{text}"
if underline:
Expand Down Expand Up @@ -76,22 +81,27 @@ def print_code_and_highlight_suspect(
column += 1


def clear_line() -> None:
    """Overwrite the current line with spaces and return the cursor to its start.

    Prints ``USEFUL_CHARS`` spaces terminated by a carriage return instead of
    a newline, so the next output starts at the beginning of the same line.
    """
    blank_line = " " * USEFUL_CHARS
    print(blank_line, end="\r")


def print_compare_result(
features1: ASTFeatures,
features2: ASTFeatures,
compare_info: CompareInfo,
compliance_matrix_df: Optional[pd.DataFrame] = None,
) -> None:
"""The function prints the result of comparing two files
"""Prints the pretty result of comparing two files.
@features1 - the features of the first source file
@features2 - the features of the second source file
@compare_info - structure consist compare metrics of two works
@threshold - threshold of plagiarism searcher alarm
Args:
features1: The features of the first source file.
features2: The features of the second source file.
compare_info: The compare metrics of two works.
compliance_matrix_df: Optional compliance matrix; it is printed when provided.
"""

print(" " * 40)
print("+" * 40)
clear_line()
print("+" * CHARS_CNT)
if features1.modify_date is not None and features2.modify_date is not None:
message = (
"-----\n"
Expand Down Expand Up @@ -128,4 +138,67 @@ def print_compare_result(
if compliance_matrix_df is not None:
print(compliance_matrix_df, "\n")

print("+" * 40)
print("+" * CHARS_CNT)


class Progress:
    """Iterator tracking the completed share of a fixed number of steps.

    The internal counter starts at ``-1``, so the first ``next()`` call yields
    ``0.0`` and every following call advances the share by ``1 / iterations``
    until ``1.0`` has been yielded.

    Args:
        iterations: Total number of steps. Non-positive values are treated as
            zero, which makes the progress complete from the start.
    """

    def __init__(self, iterations: int) -> None:
        self.__iterations: Final[int] = iterations
        self.__iteration: int = -1

    @property
    def progress(self) -> float:
        """Completed share in the range ``[0.0, 1.0]``."""
        if self.iterations == 0:
            return 1.0
        if self.__iteration <= 0:
            return 0.0
        return self.__iteration / self.iterations

    @property
    def iterations(self) -> int:
        """Total number of steps, clamped to be non-negative."""
        return self.__iterations if self.__iterations > 0 else 0

    def __iter__(self) -> "Self":
        return self

    def __next__(self) -> float:
        """Advance by one step and return the new completed share.

        Raises:
            StopIteration: If the progress has already reached ``1.0``.
        """
        if self.progress == 1.0:
            raise StopIteration("The progress has already been completed.")
        self.__iteration += 1
        return self.progress

    def __str__(self) -> str:
        return f"Progress: {self.progress:.2%}"


class ComplexProgress(Progress):
    """Progress composed of up to ``iterations`` internal progresses.

    The overall share is the sum of the internal shares divided by
    ``iterations``; internal progresses that have not been added yet
    contribute nothing.

    Args:
        iterations: Maximum number of internal progresses.
    """

    def __init__(self, iterations: int) -> None:
        super().__init__(iterations)
        self.__internal_progresses: List[Progress] = []

    def add_internal_progress(self, internal_iterations: int) -> None:
        """Register one more internal progress.

        Args:
            internal_iterations: Number of steps of the new internal progress.

        Raises:
            IndexError: If ``iterations`` internal progresses already exist.
        """
        if len(self.__internal_progresses) == self.iterations:
            raise IndexError("The internal iteration count limit was exceeded.")
        self.__internal_progresses.append(Progress(internal_iterations))

    @property
    def progress(self) -> float:
        """Completed share in the range ``[0.0, 1.0]``."""
        if self.iterations == 0:
            return 1.0
        # Sum first and divide once: adding up `share / iterations` terms
        # accumulates rounding error (e.g. ten times 0.1 may give
        # 0.9999999999999999), so the `== 1.0` completion checks could never
        # succeed and iteration would not terminate.
        completed = sum(
            internal_progress.progress
            for internal_progress in self.__internal_progresses
        )
        return float(completed / self.iterations)

    def __next__(self) -> float:
        """Advance the first unfinished internal progress by one step.

        When that step completes the internal progress, the next unfinished
        one is advanced as well, and so on.

        Returns:
            The overall completed share after the step.

        Raises:
            StopIteration: If the overall progress has already reached ``1.0``.
        """
        if self.progress == 1.0:
            raise StopIteration("The progress has already been completed.")
        for internal_progress in self.__internal_progresses:
            if internal_progress.progress == 1.0:
                continue
            if next(internal_progress) == 1.0:
                continue
            break
        return self.progress
16 changes: 11 additions & 5 deletions src/codeplag/logger.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from pathlib import Path

from codeplag.consts import UTIL_NAME
from codeplag.display import error, info, red_bold, warning
from codeplag.display import clear_line, error, info, red_bold, warning


class StreamFormatter(logging.Formatter):
Expand All @@ -27,6 +27,12 @@ def format(self, record: logging.LogRecord) -> str:
return super().format(record)


class CustomStreamHandler(logging.StreamHandler):
    """Stream handler that clears the current console line before each record."""

    def emit(self, record: logging.LogRecord) -> None:
        """Blank out the current line via ``clear_line`` and then emit ``record``."""
        clear_line()
        super().emit(record)


def get_file_handler(filename: Path) -> logging.FileHandler:
log_format = (
"%(asctime)s - [%(levelname)s] - %(name)s - "
Expand All @@ -44,20 +50,20 @@ def get_file_handler(filename: Path) -> logging.FileHandler:
return file_handler


def get_stderr_handler() -> logging.StreamHandler:
stderr_handler = logging.StreamHandler(stream=sys.stderr)
def get_stderr_handler() -> CustomStreamHandler:
stderr_handler = CustomStreamHandler(stream=sys.stderr)
stderr_handler.setLevel(logging.WARNING)
stderr_handler.setFormatter(StreamFormatter())

return stderr_handler


def get_stdout_handler() -> logging.StreamHandler:
def get_stdout_handler() -> CustomStreamHandler:
class STDOutFilter(logging.Filter):
def filter(self, record: logging.LogRecord) -> bool:
return record.levelno in [logging.INFO, logging.DEBUG]

stdout_handler = logging.StreamHandler(stream=sys.stdout)
stdout_handler = CustomStreamHandler(stream=sys.stdout)
stdout_handler.setLevel(logging.DEBUG)
stdout_handler.addFilter(STDOutFilter())
stdout_handler.setFormatter(StreamFormatter())
Expand Down
39 changes: 36 additions & 3 deletions src/codeplag/types.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from collections import defaultdict
from concurrent.futures import Future
from dataclasses import dataclass, field
from functools import total_ordering
from pathlib import Path
Expand Down Expand Up @@ -48,6 +49,14 @@ class NodeStructurePlace(NamedTuple):
uid: int


def __return_zero() -> Literal[0]:
    """Return the constant ``0``.

    Serves as a named, module-level default factory for ``defaultdict``
    (presumably so that the factory can be pickled, unlike a lambda).
    """
    zero: Literal[0] = 0
    return zero


def _create_a_defaultdict_returning_zero_by_default() -> DefaultDict[str, int]:
    """Build an empty ``defaultdict`` whose missing keys default to ``0``."""
    counters: DefaultDict[str, int] = defaultdict(__return_zero)
    return counters


@total_ordering
@dataclass
class ASTFeatures:
Expand All @@ -59,13 +68,13 @@ class ASTFeatures:
count_of_nodes: int = 0
head_nodes: List[str] = field(default_factory=list)
operators: DefaultDict[str, int] = field(
default_factory=lambda: defaultdict(lambda: 0)
default_factory=_create_a_defaultdict_returning_zero_by_default
)
keywords: DefaultDict[str, int] = field(
default_factory=lambda: defaultdict(lambda: 0)
default_factory=_create_a_defaultdict_returning_zero_by_default
)
literals: DefaultDict[str, int] = field(
default_factory=lambda: defaultdict(lambda: 0)
default_factory=_create_a_defaultdict_returning_zero_by_default
)

# unique nodes
Expand Down Expand Up @@ -134,11 +143,35 @@ class Settings(TypedDict):
reports_extension: ReportsExtension
show_progress: Flag
threshold: Threshold
workers: int


class SameHead(NamedTuple):
name: str
percent: float


class ProcessingWorksInfo(NamedTuple):
    """Two works bundled with the future of their comparison."""

    # Features of the first work of the pair.
    work1: ASTFeatures
    # Features of the second work of the pair.
    work2: ASTFeatures
    # Future tied to comparing the pair; presumably resolves to the compare result.
    compare_future: Future


SameFuncs = Dict[str, List[SameHead]]


# Problem: pickling a namedtuple instance normally succeeds,
# but fails when the module is Cythonized.
# -----
# For pickle to work, the __module__ attribute of the type must be set
# and must be correct.
# namedtuple uses a heuristic (i.e. a lookup in sys._getframe(1).f_globals)
# to obtain this information. With Cython or C extensions this heuristic
# does not work: _sys._getframe(1).f_globals.get('__name__', '__main__')
# yields importlib._bootstrap instead of the correct module.
# To fix that, the right module name must be provided for the namedtuple;
# here it is assigned explicitly.
NodeCodePlace.__module__ = __name__
NodeStructurePlace.__module__ = __name__
FastMetrics.__module__ = __name__
StructuresInfo.__module__ = __name__
CompareInfo.__module__ = __name__
Loading

0 comments on commit 4c0d89b

Please sign in to comment.