319 changes: 319 additions & 0 deletions TPs/lcs-observation.ipynb

Large diffs are not rendered by default.

40 changes: 36 additions & 4 deletions complexity/generator.py
@@ -59,10 +59,42 @@ def generate(self, size: int = 1) -> int:

#TODO: add the string generation logic
class StringGenerator(DataGenerator):
def __init__(self,alphabit=['A','B','C']):
pass
def generate(self, size: int = 1) -> int:
pass
    def __init__(self, alphabet=['A', 'B', 'C']):
        """Initialize the generator with a list of alphabet characters."""
        self.alphabet = alphabet

    def generate(self, size: int = 1) -> str:
        """Generate a random string of the given size."""
        return "".join(random.choices(self.alphabet, k=size))

    def generate_pair(self, m: int, n: int, similarity: float = 0.0) -> tuple[str, str]:
        """Generate a pair of strings of lengths m and n. `similarity` is the
        probability that a position in the second string copies the first."""
        try:
            if similarity > 1.0 or similarity < 0.0:
                raise ValueError("Similarity must be between 0.0 and 1.0")

            # Generate a base string of length max(m, n)
            base_string = self.generate(max(m, n))

            first_str = base_string[:m]

            last_str = []
            for i in range(n):
                if i < m and random.random() < similarity:
                    # Copy the character from the first string to maintain similarity
                    last_str.append(first_str[i])
                else:
                    # Use a fresh random character from the alphabet
                    last_str.append(random.choice(self.alphabet))

            return first_str, "".join(last_str)

        except ValueError as e:
            print(f"Error: {e}")
            # Fall back to an empty pair on invalid input
            return "", ""



class GraphGenerator(DataGenerator):
def __init__(self, directed: bool = False, weighted: bool = True):
218 changes: 216 additions & 2 deletions examples/common_string.py
@@ -1,4 +1,218 @@
#TODO: put the code here
import sys
from pathlib import Path

print('Please use the complexity library')
import random
import pandas as pd
from tqdm import tqdm
from collections import namedtuple

# Make the repository root importable before pulling in the complexity package
sys.path.append(str(Path(__file__).parent.parent))

from complexity import (
    DataGeneratorFactory,
    TimeAndSpaceProfiler,
    StringGenerator,
)

Metrics = namedtuple('Metrics', ['n', 'm', 'result', 'comparison_count'])

# ======================================================================= #
def lcs_recursive(X, Y, i, j):
    # comparison_count tracks character comparisons only (consistent with the DP versions)
    # Base case: one of the prefixes is empty
    if i == 0 or j == 0:
        return Metrics(n=i, m=j, result=0, comparison_count=0)

    # If the last characters match, extend the LCS of the shorter prefixes
    if X[i-1] == Y[j-1]:
        sub = lcs_recursive(X, Y, i-1, j-1)
        return Metrics(n=i, m=j, result=1 + sub.result,
                       comparison_count=1 + sub.comparison_count)

    # Otherwise take the better of the two subproblems, accumulating
    # the comparisons made in both branches
    left = lcs_recursive(X, Y, i-1, j)
    right = lcs_recursive(X, Y, i, j-1)
    return Metrics(n=i, m=j, result=max(left.result, right.result),
                   comparison_count=1 + left.comparison_count + right.comparison_count)


# ======================================================================= #
def lcs_memoized(X, Y, i, j, memo):
    # Base case: one of the prefixes is empty
    if i == 0 or j == 0:
        return Metrics(n=i, m=j, result=0, comparison_count=0)

    # Return the cached result if this subproblem was already solved
    if memo[i][j] is not None:
        return Metrics(n=i, m=j, result=memo[i][j], comparison_count=0)

    if X[i-1] == Y[j-1]:
        sub = lcs_memoized(X, Y, i-1, j-1, memo)
        memo[i][j] = sub.result + 1
        count = 1 + sub.comparison_count
    else:
        up = lcs_memoized(X, Y, i-1, j, memo)
        left = lcs_memoized(X, Y, i, j-1, memo)
        memo[i][j] = max(up.result, left.result)
        count = 1 + up.comparison_count + left.comparison_count

    return Metrics(n=i, m=j, result=memo[i][j], comparison_count=count)


# ======================================================================= #
def lcs_dp(X, Y):
m, n = len(X), len(Y)
# Initialize DP table
dp = [[0] * (n+1) for _ in range(m+1)]
comparison_count = 0 # Initialize comparison counter

# Fill DP table
for i in range(1, m+1):
for j in range(1, n+1):
comparison_count += 1 # Counting the comparison (X[i-1] == Y[j-1])
if X[i-1] == Y[j-1]: # If characters match
dp[i][j] = 1 + dp[i-1][j-1]
else: # Take max of excluding current character from X or Y
dp[i][j] = max(dp[i-1][j], dp[i][j-1])

return Metrics(n=m, m=n, result=dp[m][n], comparison_count=comparison_count)
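
As a quick check of the recurrence, this is the table lcs_dp builds for the classic pair "ABCBDAB" / "BDCABA", a well-known example whose LCS (e.g. "BCBA") has length 4; values traced by hand:

#          ""  B  D  C  A  B  A
#      ""   0  0  0  0  0  0  0
#      A    0  0  0  0  1  1  1
#      B    0  1  1  1  1  2  2
#      C    0  1  1  2  2  2  2
#      B    0  1  1  2  2  3  3
#      D    0  1  2  2  2  3  3
#      A    0  1  2  2  3  3  4
#      B    0  1  2  2  3  4  4
# lcs_dp("ABCBDAB", "BDCABA").result == 4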


# ======================================================================= #
def lcs_dp_optimized(X, Y):
m, n = len(X), len(Y)
# Use two rows instead of full DP table
prev = [0] * (n+1)
curr = [0] * (n+1)
comparison_count = 0 # Initialize comparison counter

for i in range(1, m+1):
for j in range(1, n+1):
comparison_count += 1 # Counting the comparison (X[i-1] == Y[j-1])
if X[i-1] == Y[j-1]:
curr[j] = 1 + prev[j-1]
else:
curr[j] = max(prev[j], curr[j-1])
prev, curr = curr, prev # Swap rows

return Metrics(n=m, m=n, result=prev[n], comparison_count=comparison_count)


# ======================================================================= #
def lcs_reconstruct(X, Y):
m, n = len(X), len(Y)
dp = [[0] * (n+1) for _ in range(m+1)]
comparison_count = 0 # Initialize comparison counter

# Fill DP table
for i in range(1, m+1):
for j in range(1, n+1):
comparison_count += 1 # Counting the comparison (X[i-1] == Y[j-1])
if X[i-1] == Y[j-1]:
dp[i][j] = 1 + dp[i-1][j-1]
else:
dp[i][j] = max(dp[i-1][j], dp[i][j-1])

# Backtrack to reconstruct LCS
lcs = []
i, j = m, n
while i > 0 and j > 0:
comparison_count += 1 # Counting the comparison (X[i-1] == Y[j-1])
if X[i-1] == Y[j-1]:
lcs.append(X[i-1])
i -= 1
j -= 1
elif dp[i-1][j] > dp[i][j-1]:
i -= 1
else:
j -= 1

    lcs_string = ''.join(reversed(lcs))  # the reconstructed subsequence itself
    return Metrics(n=m, m=n, result=len(lcs_string), comparison_count=comparison_count)
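
Before benchmarking, a reviewer can sanity-check that all five implementations agree on the classic pair above (length 4 is the known answer; the snippet reuses only names defined in this file):

# Sanity check: all implementations should agree (illustrative, cheap to run)
_X, _Y = "ABCBDAB", "BDCABA"
_memo = [[None] * (len(_Y) + 1) for _ in range(len(_X) + 1)]
assert lcs_recursive(_X, _Y, len(_X), len(_Y)).result == 4
assert lcs_memoized(_X, _Y, len(_X), len(_Y), _memo).result == 4
assert lcs_dp(_X, _Y).result == 4
assert lcs_dp_optimized(_X, _Y).result == 4
assert lcs_reconstruct(_X, _Y).result == 4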



# Set up data generators
factory = DataGeneratorFactory()

# Use the full uppercase alphabet A-Z
default_alphabet = [chr(i) for i in range(65, 91)]
factory.register_generator("random_string", StringGenerator(alphabet=default_alphabet))

# Fix the random seed so runs are reproducible
random.seed(42)

# The recursive variants on 1000-character inputs exceed CPython's default
# recursion limit (1000 frames), so raise it before benchmarking
sys.setrecursionlimit(5000)

# Define ranges for each experiment
ranges = [100, 500, 1000]  # Upper bound on string lengths for each batch
nbr_experiments = 20  # Number of experiments per size

# Include specific edge cases
edge_cases = [
(1, 1, 0.0), # Minimum size, completely dissimilar
(1, 1, 1.0), # Minimum size, identical
(1000, 1000, 0.0), # Large size, completely dissimilar
(1000, 1000, 1.0), # Large size, identical
]

# Generate pairs of strings with different lengths (m and n) and similarity for each experiment
random_string_pairs = [
[
# Generate m, n, and similarity within specific ranges
factory.get_generator("random_string").generate_pair(
m=random.randint(1, size), # Random length m from 1 to 'size'
n=random.randint(1, size), # Random length n from 1 to 'size'
similarity=random.uniform(0.0, 1.0) # Random similarity between 0.0 and 1.0
) for _ in range(nbr_experiments)
]
for size in ranges
]
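
The comprehension yields one batch of pairs per entry in `ranges`; the edge cases below are appended as extra batches:

# random_string_pairs[0] -> 20 (X, Y) pairs with lengths up to 100
# random_string_pairs[1] -> 20 pairs with lengths up to 500
# random_string_pairs[2] -> 20 pairs with lengths up to 1000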

# Add the edge cases as extra single-pair batches; extend `ranges` in step so
# the zip in the benchmarking loop below covers these batches too
for m, n, similarity in edge_cases:
    random_string_pairs.append([
        factory.get_generator("random_string").generate_pair(m=m, n=n, similarity=similarity)
    ])
    ranges.append(max(m, n))

# Functions to benchmark
funcs = [
lcs_recursive,
lcs_memoized,
lcs_dp,
lcs_dp_optimized,
lcs_reconstruct,
]

# Benchmarking
profiler = TimeAndSpaceProfiler()
results = []

# Create a tqdm progress bar for tracking the experiments
total_iterations = len(funcs) * sum(len(batch) for batch in random_string_pairs)
print(f"\nRunning {total_iterations} LCS function experiments...")
with tqdm(total=total_iterations, desc="Benchmarking", unit="experiment") as pbar:
for func in funcs:
for size, experiments in zip(ranges, random_string_pairs):
for experiment_idx, data in enumerate(experiments):
# Run and profile
                X, Y = data  # The two strings to compare
                if func is lcs_recursive:
                    logs = profiler.profile(func, X, Y, len(X), len(Y))
                elif func is lcs_memoized:
                    # Fresh memo table per run so experiments stay independent
                    memo = [[None] * (len(Y)+1) for _ in range(len(X)+1)]
                    logs = profiler.profile(func, X, Y, len(X), len(Y), memo)
                else:
                    logs = profiler.profile(func, X, Y)

results.append({
"size": size,
**logs
})
pbar.update(1)


# Save results to CSV
results_df = pd.DataFrame(results)
results_df.to_csv("benchmark_results.csv", index=False)
print("\nBenchmarking completed. Results saved to 'benchmark_results.csv'.")
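
As a follow-up, the results can be summarized per function and size. The column names below are assumptions about what TimeAndSpaceProfiler emits (adjust them to the actual log keys):

# Assumed columns: 'function' and 'time' -- rename to match the profiler's actual output
summary = results_df.groupby(['function', 'size'])['time'].mean().unstack()
print(summary)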