-
Notifications
You must be signed in to change notification settings - Fork 294
Chiel dat 3098 create dockerized dev setup for xdiff #2
Changes from 6 commits
f4cb455
6ecf455
d122b3b
9bcd531
7ceac8d
6e8edd2
1961a64
4b6e004
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -127,3 +127,10 @@ dmypy.json | |
|
||
# Pyre type checker | ||
.pyre/ | ||
|
||
# dev | ||
ml-25m | ||
drive | ||
|
||
# Mac | ||
.DS_Store |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
version: "3.8" | ||
|
||
services: | ||
postgres: | ||
container_name: postgresql | ||
image: postgres:14.1-alpine | ||
command: > | ||
-c work_mem=1GB | ||
-c maintenance_work_mem=1GB | ||
-c max_wal_size=8GB | ||
restart: always | ||
volumes: | ||
- postgresql-data:/var/lib/postgresql/data:delegated | ||
ports: | ||
- '5432:5432' | ||
expose: | ||
- '5432' | ||
env_file: | ||
- .env | ||
tty: true | ||
networks: | ||
- local | ||
|
||
mysql: | ||
container_name: mysql | ||
image: "${MYSQL_IMAGE:-mysql}" | ||
command: > | ||
--default-authentication-plugin=mysql_native_password | ||
--innodb-buffer-pool-size=8G | ||
--key_buffer_size=8G | ||
--read_buffer_size=2G | ||
--max_connections=10 | ||
--innodb_io_capacity=400 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. same here |
||
restart: always | ||
volumes: | ||
- mysql-data:/var/lib/mysql:delegated | ||
user: mysql | ||
ports: | ||
- '3306:3306' | ||
expose: | ||
- '3306' | ||
env_file: | ||
- .env | ||
tty: true | ||
networks: | ||
- local | ||
|
||
volumes: | ||
postgresql-data: | ||
mysql-data: | ||
|
||
networks: | ||
local: | ||
driver: bridge |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
#!/usr/bin/env bash | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would call this |
||
set -ex | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ❤️ I sense a fellow scarred basher |
||
|
||
if [[ $( dirname $0 ) != "." ]]; then | ||
echo "Execute from /dev folder." | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please make this execute from the root directory as vanilla I know this is partly because of
|
||
exit | ||
fi | ||
|
||
# Use a linux/arm64 docker image for MySQL when running on ARM | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This worked automatically for me on the oracle images, did it not for you? Are you sure we need this? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Well, some images don't support all architectures, so you need a specific one. That is the case for this one. |
||
CPU_ARCHITECTURE=$(uname -p) | ||
if [[ "${CPU_ARCHITECTURE}" == "arm" ]]; then | ||
MYSQL_IMAGE="arm64v8/mysql:oracle" | ||
fi | ||
|
||
main () { | ||
initialize | ||
prepaire_db | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. prepare |
||
xdiff | ||
shutdown | ||
} | ||
|
||
initialize() { | ||
pip install poetry | ||
poetry install | ||
pip install preql==0.2.10 # Temporary due to version conflicts for runtype | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 0.2.11 now (or higher) |
||
|
||
if [ ! -f ./ml-25m/ratings.csv ]; then | ||
echo "Example data not found. Downloading.." | ||
wget https://files.grouplens.org/datasets/movielens/ml-25m.zip | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Might be worth moving this to a |
||
unzip ml-25m.zip | ||
fi | ||
MYSQL_IMAGE=${MYSQL_IMAGE} docker-compose up -d | ||
sleep 15 # Increase if you receive error like: `mysql.connector.errors.InterfaceError: 2013: Lost connection to MySQL server during query` | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe instead of sleep, check for the connection every few seconds? Sounds more robust to me. |
||
} | ||
|
||
prepaire_db() { | ||
preql -m prepare_db mysql://mysql:Password1@localhost/mysql | ||
preql -m prepare_db postgres://postgres:Password1@localhost/postgres | ||
} | ||
|
||
xdiff() { | ||
poetry run xdiff postgres://postgres:Password1@localhost/postgres Rating mysql://mysql:Password1@localhost/mysql Rating_del1p -c timestamp --stats -v | ||
} | ||
|
||
shutdown() { | ||
docker-compose down | ||
} | ||
|
||
main |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,6 @@ | ||
// // Declare table & functions | ||
func run_sql(code) { | ||
print code | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think it just adds too much clutter. If you want to see the SQL statements, you can add `--print-sql`, like `preql -m prepare_db --print-sql`. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I see, makes sense |
||
force_eval( SQL( nulltype, code )) | ||
} | ||
|
||
|
@@ -45,16 +46,20 @@ middle = count(Rating) /~ 2 | |
// - We use 'const table' to avoid updating the ids | ||
|
||
// Rating_del1 = Delete middle row | ||
print "Create Rating_del1" | ||
const table Rating_del1 = Rating | ||
Rating_del1.add_index("id") | ||
Rating_del1[middle..(middle+1)] delete [true] | ||
assert count(Rating) == count(Rating_del1) + 1 | ||
|
||
// Rating_del1 = Update middle row | ||
// Rating_update1 = Update middle row | ||
print "Create Rating_update1" | ||
const table Rating_update1 = Rating | ||
Rating_update1.add_index("id") | ||
Rating_update1[middle..(middle+1)] update {timestamp: timestamp + 1} | ||
|
||
// Rating_<>p = Percentile of rows changed | ||
print "Create percentile tables" | ||
const table Rating_update001p = Rating | ||
const table Rating_update1p = Rating | ||
const table Rating_del1p = Rating | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
[tool.poetry] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't love this as a separate I'm not super familiar with Python, but I think we should follow standard conventions for contributions if possible. Presumably those are that you go to the main directory, run |
||
name = "xdiff-infra" | ||
version = "0.1.0" | ||
description = "" | ||
authors = ["chiel <chiel@datafold.com>"] | ||
license = "MIT" | ||
|
||
[tool.poetry.dependencies] | ||
python = "^3.7" | ||
psycopg2 = "^2.9.3" | ||
mysql-connector-python = "^8.0" | ||
datafold-xdiff = { path = "../", develop = true } | ||
|
||
[build-system] | ||
requires = ["poetry-core>=1.0.0"] | ||
build-backend = "poetry.core.masonry.api" |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,8 +1,18 @@ | ||
# Test XDiff with Postgres and MySQL | ||
From inside the `dev` directory, run the following: | ||
|
||
``` | ||
chmod +x ./example.sh | ||
./example.sh | ||
``` | ||
|
||
NB for Mac: if the process takes very long (e.g. importing the CSV file takes >30m), make sure that you have the latest version of Docker installed and have enabled the experimental features `Use the new Virtualization framework` and `Enable VirtioFS accelerated directory sharing`, because the interaction between Docker and the macOS file system is a bottleneck. | ||
|
||
## Manual setup | ||
|
||
1. Install XDiff | ||
|
||
either `pip install xdiff` or | ||
`pip install xdiff -e ../` | ||
|
||
2. Install Preql (0.2.9 or up) | ||
|
||
|
@@ -20,15 +30,15 @@ unzip ml-25m.zip | |
(note: bigquery and mssql have their own setup scripts) | ||
|
||
``` | ||
preql -m prepare_db postgres://<uri> | ||
preql -m prepare_db postgres://<uri> | ||
|
||
preql -m prepare_db mysql://<uri> | ||
preql -m prepare_db mysql://<uri> | ||
|
||
preql -m prepare_db snowflake://<uri> | ||
preql -m prepare_db snowflake://<uri> | ||
|
||
preql -m prepare_db_bigquery bigquery:///<project> | ||
preql -m prepare_db_bigquery bigquery:///<project> | ||
|
||
preql -m prepare_db_mssql mssql://<uri> | ||
preql -m prepare_db_mssql mssql://<uri> | ||
|
||
|
||
etc. | ||
|
@@ -39,10 +49,10 @@ And it's ready to use! | |
Example: | ||
|
||
```bash | ||
xdiff postgres:/// Rating postgres:/// Rating_del1p -c timestamp --stats | ||
xdiff postgres://<uri> Rating postgres://<uri> Rating_del1 -c timestamp --stats | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Technically, |
||
|
||
Diff-Total: 250156 changed rows out of 25000095 | ||
Diff-Percent: 1.0006% | ||
Diff-Split: +250156 -0 | ||
|
||
``` | ||
``` |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -24,16 +24,19 @@ classifiers = [ | |
packages = [{ include = "xdiff" }] | ||
|
||
[tool.poetry.dependencies] | ||
python = "^3.6" | ||
runtype = "^0.1.16" | ||
python = "^3.7" | ||
runtype = "^0.2.4" | ||
dsnparse = "*" | ||
pyparsing = "^3.0" | ||
click = "^8.1" | ||
|
||
psycopg2 = { version = "*", optional = true } | ||
mysqlclient = { version = "*", optional = true } | ||
mysql-connector-python = { version = "*", optional = true} | ||
snowflake-connector-python = { version = "*", optional = true } | ||
|
||
[tool.poetry.extras] | ||
mysql = ["mysqlclient"] | ||
mysql = ["mysql-connector-python"] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 👍🏻 ran into this too |
||
pgsql = ["psycopg2"] | ||
snowflake = ["snowflake-connector-python"] | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why was this necessary?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, for the size of the movielens dataset, the defaults are too low.