Skip to content
This repository was archived by the owner on May 17, 2024. It is now read-only.

Commit ee35ee2

Browse files
authored
Merge pull request #31 from datafold/editorconfig
Added editorconfig; updated files
2 parents 8e654d0 + 8e54102 commit ee35ee2

File tree

8 files changed

+935
-919
lines changed

8 files changed

+935
-919
lines changed

.editorconfig

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
# EditorConfig spec for a consistent cross-editor style.
2+
# Read more: https://EditorConfig.org
3+
4+
root = true
5+
6+
[*]
7+
# Unix-style newlines with a newline ending every file
end_of_line = lf
8+
insert_final_newline = true
9+
trim_trailing_whitespace = true
10+
# 4 space indentation
11+
indent_style = space
12+
indent_size = 4
13+
14+
[*.{md,py}]
15+
charset = utf-8
16+

data_diff/__init__.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
1-
from typing import Tuple
2-
3-
from .database import connect_to_uri
4-
from .diff_tables import TableSegment, TableDiffer
5-
6-
7-
def create_source(db_uri: str, table_name: str, key_column: str, extra_columns: Tuple[str, ...] = ()):
8-
db = connect_to_uri(db_uri)
9-
return TableSegment(db, (table_name,), key_column, tuple(extra_columns))
1+
from typing import Tuple
2+
3+
from .database import connect_to_uri
4+
from .diff_tables import TableSegment, TableDiffer
5+
6+
7+
def create_source(db_uri: str, table_name: str, key_column: str, extra_columns: Tuple[str, ...] = ()):
    """Connect to a database and wrap one of its tables in a TableSegment.

    Args:
        db_uri: Connection URI handed to connect_to_uri().
        table_name: Name of the table; wrapped in a 1-tuple to form the table path.
        key_column: Name of the key column of the table.
        extra_columns: Names of additional columns to include in the segment.

    Returns:
        A TableSegment bound to the opened database connection.
    """
    db = connect_to_uri(db_uri)
    # The command-line entry point calls
    # TableSegment(db, path, key_column, update_column, columns, ...), so the
    # 4th positional slot is the update column. Pass None there explicitly so
    # the extra columns land in the 5th slot instead of being taken for the
    # update column.
    return TableSegment(db, (table_name,), key_column, None, tuple(extra_columns))

data_diff/__main__.py

Lines changed: 115 additions & 115 deletions
Original file line numberDiff line numberDiff line change
@@ -1,115 +1,115 @@
1-
from multiprocessing.sharedctypes import Value
2-
import sys
3-
import time
4-
import logging
5-
from itertools import islice
6-
7-
from .diff_tables import TableSegment, TableDiffer
8-
from .database import connect_to_uri
9-
from .parse_time import parse_time_before_now, UNITS_STR, ParseError
10-
11-
import click
12-
13-
LOG_FORMAT = "[%(asctime)s] %(levelname)s - %(message)s"
14-
DATE_FORMAT = "%H:%M:%S"
15-
16-
17-
@click.command()
18-
@click.argument("db1_uri")
19-
@click.argument("table1_name")
20-
@click.argument("db2_uri")
21-
@click.argument("table2_name")
22-
@click.option("-k", "--key-column", default="id", help="Name of primary key column")
23-
@click.option("-t", "--update-column", default=None, help="Name of updated_at/last_updated column")
24-
@click.option("-c", "--columns", default=[], multiple=True, help="Names of extra columns to compare")
25-
@click.option("-l", "--limit", default=None, help="Maximum number of differences to find")
26-
@click.option("--bisection-factor", default=32, help="Segments per iteration")
27-
@click.option("--bisection-threshold", default=1024**2, help="Minimal bisection threshold")
28-
@click.option(
29-
"--min-age",
30-
default=None,
31-
help="Considers only rows older than specified. "
32-
"Example: --min-age=5min ignores rows from the last 5 minutes. "
33-
f"\nValid units: {UNITS_STR}",
34-
)
35-
@click.option("--max-age", default=None, help="Considers only rows younger than specified. See --min-age.")
36-
@click.option("-s", "--stats", is_flag=True, help="Print stats instead of a detailed diff")
37-
@click.option("-d", "--debug", is_flag=True, help="Print debug info")
38-
@click.option("-v", "--verbose", is_flag=True, help="Print extra info")
39-
@click.option("-i", "--interactive", is_flag=True, help="Confirm queries, implies --debug")
40-
def main(
41-
db1_uri,
42-
table1_name,
43-
db2_uri,
44-
table2_name,
45-
key_column,
46-
update_column,
47-
columns,
48-
limit,
49-
bisection_factor,
50-
bisection_threshold,
51-
min_age,
52-
max_age,
53-
stats,
54-
debug,
55-
verbose,
56-
interactive,
57-
):
58-
if limit and stats:
59-
print("Error: cannot specify a limit when using the -s/--stats switch")
60-
return
61-
if interactive:
62-
debug = True
63-
64-
if debug:
65-
logging.basicConfig(level=logging.DEBUG, format=LOG_FORMAT, datefmt=DATE_FORMAT)
66-
elif verbose:
67-
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT, datefmt=DATE_FORMAT)
68-
69-
db1 = connect_to_uri(db1_uri)
70-
db2 = connect_to_uri(db2_uri)
71-
72-
if interactive:
73-
db1.enable_interactive()
74-
db2.enable_interactive()
75-
76-
start = time.time()
77-
78-
try:
79-
options = dict(
80-
min_time=min_age and parse_time_before_now(min_age), max_time=max_age and parse_time_before_now(max_age)
81-
)
82-
except ParseError as e:
83-
logging.error("Error while parsing age expression: %s" % e)
84-
return
85-
86-
table1 = TableSegment(db1, (table1_name,), key_column, update_column, columns, **options)
87-
table2 = TableSegment(db2, (table2_name,), key_column, update_column, columns, **options)
88-
89-
differ = TableDiffer(bisection_factor=bisection_factor, bisection_threshold=bisection_threshold, debug=debug)
90-
diff_iter = differ.diff_tables(table1, table2)
91-
92-
if limit:
93-
diff_iter = islice(diff_iter, int(limit))
94-
95-
if stats:
96-
diff = list(diff_iter)
97-
unique_diff_count = len({i[0] for _, i in diff})
98-
percent = 100 * unique_diff_count / table1.count
99-
print(f"Diff-Total: {len(diff)} changed rows out of {table1.count}")
100-
print(f"Diff-Percent: {percent:.4f}%")
101-
plus = len([1 for op, _ in diff if op == "+"])
102-
minus = len([1 for op, _ in diff if op == "-"])
103-
print(f"Diff-Split: +{plus} -{minus}")
104-
else:
105-
for op, key in diff_iter:
106-
print(op, key)
107-
sys.stdout.flush()
108-
109-
end = time.time()
110-
111-
logging.info(f"Duration: {end-start:.2f} seconds.")
112-
113-
114-
if __name__ == "__main__":
115-
main()
1+
from multiprocessing.sharedctypes import Value
2+
import sys
3+
import time
4+
import logging
5+
from itertools import islice
6+
7+
from .diff_tables import TableSegment, TableDiffer
8+
from .database import connect_to_uri
9+
from .parse_time import parse_time_before_now, UNITS_STR, ParseError
10+
11+
import click
12+
13+
LOG_FORMAT = "[%(asctime)s] %(levelname)s - %(message)s"
14+
DATE_FORMAT = "%H:%M:%S"
15+
16+
17+
@click.command()
@click.argument("db1_uri")
@click.argument("table1_name")
@click.argument("db2_uri")
@click.argument("table2_name")
@click.option("-k", "--key-column", default="id", help="Name of primary key column")
@click.option("-t", "--update-column", default=None, help="Name of updated_at/last_updated column")
@click.option("-c", "--columns", default=[], multiple=True, help="Names of extra columns to compare")
# Declaring the type lets click reject a non-numeric or negative limit with a
# usage error up front, instead of a traceback once the diff is under way.
@click.option(
    "-l", "--limit", default=None, type=click.IntRange(min=0), help="Maximum number of differences to find"
)
@click.option("--bisection-factor", default=32, help="Segments per iteration")
@click.option("--bisection-threshold", default=1024**2, help="Minimal bisection threshold")
@click.option(
    "--min-age",
    default=None,
    help="Considers only rows older than specified. "
    "Example: --min-age=5min ignores rows from the last 5 minutes. "
    f"\nValid units: {UNITS_STR}",
)
@click.option("--max-age", default=None, help="Considers only rows younger than specified. See --min-age.")
@click.option("-s", "--stats", is_flag=True, help="Print stats instead of a detailed diff")
@click.option("-d", "--debug", is_flag=True, help="Print debug info")
@click.option("-v", "--verbose", is_flag=True, help="Print extra info")
@click.option("-i", "--interactive", is_flag=True, help="Confirm queries, implies --debug")
def main(
    db1_uri,
    table1_name,
    db2_uri,
    table2_name,
    key_column,
    update_column,
    columns,
    limit,
    bisection_factor,
    bisection_threshold,
    min_age,
    max_age,
    stats,
    debug,
    verbose,
    interactive,
):
    """Diff two database tables and print the differences found.

    Each difference is printed and flushed one at a time; with -s/--stats a
    summary (total, percentage, +/- split) is printed instead.
    """
    # `limit` is an int (or None) now, so test for presence explicitly: a
    # limit of 0 must still count as "a limit was given".
    has_limit = limit is not None

    if has_limit and stats:
        print("Error: cannot specify a limit when using the -s/--stats switch")
        return
    if interactive:
        debug = True

    if debug:
        logging.basicConfig(level=logging.DEBUG, format=LOG_FORMAT, datefmt=DATE_FORMAT)
    elif verbose:
        logging.basicConfig(level=logging.INFO, format=LOG_FORMAT, datefmt=DATE_FORMAT)

    db1 = connect_to_uri(db1_uri)
    db2 = connect_to_uri(db2_uri)

    if interactive:
        db1.enable_interactive()
        db2.enable_interactive()

    start = time.time()

    # Age expressions are optional; an unset one is passed through unchanged.
    try:
        options = dict(
            min_time=min_age and parse_time_before_now(min_age),
            max_time=max_age and parse_time_before_now(max_age),
        )
    except ParseError as e:
        logging.error("Error while parsing age expression: %s", e)
        return

    table1 = TableSegment(db1, (table1_name,), key_column, update_column, columns, **options)
    table2 = TableSegment(db2, (table2_name,), key_column, update_column, columns, **options)

    differ = TableDiffer(bisection_factor=bisection_factor, bisection_threshold=bisection_threshold, debug=debug)
    diff_iter = differ.diff_tables(table1, table2)

    if has_limit:
        diff_iter = islice(diff_iter, limit)

    if stats:
        diff = list(diff_iter)
        # Read the row count once and reuse it for both the ratio and the report.
        row_count = table1.count
        unique_diff_count = len({key[0] for _, key in diff})
        # An empty first table would otherwise raise ZeroDivisionError here.
        percent = 100 * unique_diff_count / row_count if row_count else 0.0
        print(f"Diff-Total: {len(diff)} changed rows out of {row_count}")
        print(f"Diff-Percent: {percent:.4f}%")
        plus = sum(1 for op, _ in diff if op == "+")
        minus = sum(1 for op, _ in diff if op == "-")
        print(f"Diff-Split: +{plus} -{minus}")
    else:
        for op, key in diff_iter:
            print(op, key)
            # Flush per row so each difference is emitted without buffering delay.
            sys.stdout.flush()

    end = time.time()

    logging.info("Duration: %.2f seconds.", end - start)


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)