# compresor.py — from the GitHub repository ddiazghub/Python-LZ77-Compression (branch: master)
from timeit import Timer
from argparse import ArgumentParser
from constants import LENGTH_THRESHOLD, WINDOW_SIZE, CHUNK_SIZE, MAX_REF_LENGTH
from reference import Reference

def window_match(lookahead: bytes, window: bytes) -> Reference:
    """Find an earlier occurrence in ``window`` of the sequence that starts ``lookahead``.

    Every position in ``window`` holding the first lookahead byte is tried as a
    candidate. A candidate match may not run past the end of ``window`` and
    always leaves at least one lookahead byte unmatched, so that a "next byte"
    is available to store in the reference. The scan ends early as soon as the
    best match is longer than ``LENGTH_THRESHOLD``.

    Args:
        lookahead (bytes): The current byte followed by the bytes after it.
            Must not be empty.
        window (bytes): Previously seen bytes to search for the sequence.

    Returns:
        Reference: Built from (distance back from the end of ``window``, match
            length, lookahead byte right after the match). When nothing
            matches, it is ``Reference(0, 0, lookahead[0])``.
    """
    first_byte = lookahead[0]
    best = Reference(0, 0, first_byte)
    window_size = len(window)
    # The final lookahead byte is never part of a match: it is kept as the
    # byte that follows the match.
    longest_allowed = len(lookahead) - 1

    position = window.find(first_byte)

    while position != -1:
        distance = window_size - position
        cap = min(distance, longest_allowed)

        # Only inspect candidates that could still beat the current best.
        if cap > best.length:
            # The byte at `position` already equals lookahead[0]; count how
            # many further bytes agree, up to the cap.
            matched = 1
            while matched < cap and window[position + matched] == lookahead[matched]:
                matched += 1

            if matched > best.length:
                best = Reference(distance, matched, lookahead[matched])

        if best.length > LENGTH_THRESHOLD:
            return best

        position = window.find(first_byte, position + 1)

    return best

def process_chunk(chunk: bytes, offset: int) -> bytearray:
    """Compress the part of ``chunk`` that starts at ``offset``.

    Bytes before ``offset`` are not emitted; they only serve as history that
    later positions may refer back to.

    Args:
        chunk (bytes): Data to compress, optionally preceded by history bytes.
        offset (int): Index in ``chunk`` at which compression starts.

    Returns:
        bytearray: The concatenated ``to_bytes()`` encodings of every
            reference produced.
    """
    compressed = bytearray()
    total = len(chunk)
    cursor = offset

    while cursor < total:
        history = chunk[max(cursor - WINDOW_SIZE, 0):cursor]
        upcoming = chunk[cursor:cursor + MAX_REF_LENGTH]
        reference = window_match(upcoming, history)
        compressed.extend(reference.to_bytes())

        # A reference accounts for `length` matched bytes plus the one byte
        # that follows them, so jump past all of those.
        cursor += reference.length + 1

    return compressed

def compress(filename: str, outfile: str):
    """Compress a file with the LZ77 algorithm.

    The input is read in pieces of ``CHUNK_SIZE`` bytes. The last
    ``WINDOW_SIZE`` bytes of the previously processed data are placed in front
    of each new piece so that matches can refer back across piece boundaries.

    Args:
        filename (str): Path of the file to compress.
        outfile (str): Path of the compressed output file.
    """
    with open(filename, "rb") as source:
        with open(outfile, "wb") as target:
            history = b""

            while True:
                piece = source.read(CHUNK_SIZE)
                if not piece:
                    break

                # Keep only the tail of what was already processed as context.
                carried = history[-WINDOW_SIZE:]
                history = carried + piece
                target.write(process_chunk(history, len(carried)))

if __name__ == "__main__":
    # Command-line entry point: compress the given file once and print how
    # long it took, as measured by timeit.
    cli = ArgumentParser(
        prog="Compresor LZ77",
        description="Comprime un archivo usando el algoritmo LZ77",
    )
    cli.add_argument("filename", help="Archivo a comprimir")
    cli.add_argument(
        "-o",
        "--outfile",
        help="Nombre del archivo comprimido",
        default="comprimido.elmejorprofesor",
    )

    options = cli.parse_args()
    source_path = options.filename
    target_path = options.outfile

    # Run the compression exactly once under the timer.
    elapsed = Timer(lambda: compress(source_path, target_path)).timeit(1)
    print(elapsed)