Skip to content

Commit

Permalink
dev(narugo): add auto run
Browse files Browse the repository at this point in the history
  • Loading branch information
narugo1992 committed Dec 28, 2023
1 parent 251e287 commit 1272076
Show file tree
Hide file tree
Showing 4 changed files with 195 additions and 1 deletion.
76 changes: 76 additions & 0 deletions .github/workflows/repack.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
#file: noinspection YAMLSchemaValidation
# Workflow that installs the project and runs `python -m skpick.online`.
name: Repacker

# Triggers: every push, manual dispatch, and a daily cron at minute 0 of hour 14
# (GitHub evaluates cron schedules in UTC).
on:
push:
workflow_dispatch:
schedule:
- cron: '0 14 * * *'

jobs:
# NOTE(review): the job id and the final step are named "unittest", but no test
# runner is invoked here; the job executes the online picker.
unittest:
name: Code Test Repacker
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
# Single-entry matrix: one OS and one Python version.
matrix:
os:
- 'ubuntu-latest'
python-version:
- '3.8'

steps:
# Export OS flags to later steps through GITHUB_ENV; only OS_NAME is read again
# within this workflow (by the system-dependency step).
- name: Get system version for Linux
if: ${{ contains(matrix.os, 'ubuntu') }}
shell: bash
run: |
echo "OS_NAME=Linux" >> $GITHUB_ENV
echo "IS_WIN=" >> $GITHUB_ENV
echo "IS_MAC=" >> $GITHUB_ENV
# Export an empty IS_PYPY whenever the matrix Python version is not a PyPy one.
- name: Set environment for Cpython
if: ${{ !contains(matrix.python-version, 'pypy') }}
shell: bash
run: |
echo "IS_PYPY=" >> $GITHUB_ENV
# NOTE(review): actions/checkout@v2 is an old major version; consider upgrading.
- name: Checkout code
uses: actions/checkout@v2
with:
fetch-depth: 20
submodules: 'recursive'
# Install the command-line tools used by later steps (tree, cloc) plus git-lfs.
- name: Set up system dependences on Linux
if: ${{ env.OS_NAME == 'Linux' }}
shell: bash
run: |
sudo apt-get update
sudo apt-get install -y tree cloc wget curl make zip
sudo apt-get install -y git-lfs
# NOTE(review): actions/setup-python@v2 is an old major version; consider upgrading.
- name: Set up python ${{ matrix.python-version }}
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
# Install build tooling and the three requirements files of the project.
- name: Install dependencies
shell: bash
run: |
python -m pip install --upgrade pip
pip install --upgrade flake8 setuptools wheel twine
pip install -r requirements.txt
pip install -r requirements-test.txt
pip install -r requirements-extra.txt
# Print interpreter/package versions and a source-tree overview for debugging.
# NOTE(review): `cloc pyskeb` and `cloc test` assume those directories exist in
# the repository; the code added alongside this workflow lives under `skpick` - confirm.
- name: Test the basic environment
shell: bash
run: |
python -V
pip --version
pip list
tree .
cloc pyskeb
cloc test
# Runs the picker module; the source and destination repositories and the
# Hugging Face token are supplied from repository secrets.
- name: Run unittest
env:
CI: 'true'
HF_TOKEN: ${{ secrets.HF_TOKEN }}
HF_SRC_REPO: ${{ secrets.HF_SRC_REPO }}
HF_DST_REPO: ${{ secrets.HF_DST_REPO }}
shell: bash
run: |
python -m skpick.online
4 changes: 3 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,6 @@ dghs-imgutils>=0.3.2
tqdm
numpy
pillow
pandas
pandas
natsort
di-toolkit
82 changes: 82 additions & 0 deletions skpick/online.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
import json
import os.path

import pandas as pd
from ditk import logging
from hfutils.operate import get_hf_fs, download_file_to_file, upload_directory_as_directory, get_hf_client
from hfutils.utils import tqdm, TemporaryDirectory
from natsort import natsorted

from .pick import pick_from_package

logging.try_init_root(level=logging.INFO)


def online_pick(src_repo: str, dst_repo: str, max_packages: int = 1):
    """
    Pick files from source packages that the destination repository has not recorded yet.

    The package list is read from ``index.json`` of the source dataset repository, and the
    already-processed list from ``index.json`` of the destination dataset repository (which
    is created as a private dataset when it does not exist). Each pending package is
    downloaded from ``packs/<filename>``, processed with :func:`pick_from_package`, and the
    result is uploaded to the destination repository together with a refreshed
    ``index.json`` and a ``README.md`` table. One upload is made per package.

    :param src_repo: Id of the source dataset repository.
    :param dst_repo: Id of the destination dataset repository.
    :param max_packages: Maximum number of pending packages handled in one call, taken in
        natural-sort order. Defaults to ``1``, i.e. one package per invocation.
    """
    hf_fs = get_hf_fs()
    hf_client = get_hf_client()

    src_index_path = f'datasets/{src_repo}/index.json'
    if hf_fs.exists(src_index_path):
        src_packages = [
            item['filename'] for item in
            json.loads(hf_fs.read_text(src_index_path))
        ]
    else:
        src_packages = []

    if not hf_client.repo_exists(repo_id=dst_repo, repo_type='dataset'):
        hf_client.create_repo(repo_id=dst_repo, repo_type='dataset', private=True, exist_ok=True)
    dst_index_path = f'datasets/{dst_repo}/index.json'
    if hf_fs.exists(dst_index_path):
        dst_index = json.loads(hf_fs.read_text(dst_index_path))
    else:
        dst_index = []
    dst_packages = [item['filename'] for item in dst_index]

    pending = natsorted(set(src_packages) - set(dst_packages))[:max_packages]
    for package in tqdm(pending):
        with TemporaryDirectory() as td_src, TemporaryDirectory() as td_dst:
            zip_file = os.path.join(td_src, package)
            download_file_to_file(
                local_file=zip_file,
                repo_id=src_repo,
                repo_type='dataset',
                file_in_repo=f'packs/{package}',
            )

            pick_from_package(zip_file, td_dst)
            # One counter per sub-directory produced by the pick; the count is the number
            # of direct entries in that sub-directory. Sorted so key order is reproducible.
            dst_index.append({
                'filename': package,
                **{
                    d: len(os.listdir(os.path.join(td_dst, d)))
                    for d in sorted(os.listdir(td_dst))
                    if os.path.isdir(os.path.join(td_dst, d))
                }
            })

            # Collect every key in first-seen order (newest entry first). An ordered
            # collection keeps the zero-fill order, and therefore the README columns,
            # stable between runs; a plain set would not.
            columns = list(dict.fromkeys(
                name for item in reversed(dst_index) for name in item
            ))
            df_rows = []
            for item in reversed(dst_index):
                # Entries are filled in place, so index.json also carries the zeros.
                for name in columns:
                    item.setdefault(name, 0)
                df_rows.append(item)

            df = pd.DataFrame(df_rows, columns=columns)
            # Explicit UTF-8: index.json is dumped with ensure_ascii=False, so the
            # platform default encoding must not be relied upon.
            with open(os.path.join(td_dst, 'README.md'), 'w', encoding='utf-8') as f:
                print(df.to_markdown(index=False), file=f)
            with open(os.path.join(td_dst, 'index.json'), 'w', encoding='utf-8') as f:
                json.dump(dst_index, f, indent=4, ensure_ascii=False)

            upload_directory_as_directory(
                local_directory=td_dst,
                repo_id=dst_repo,
                repo_type='dataset',
                path_in_repo='.',
                message=f'Pick from {package!r}',
            )


if __name__ == '__main__':
    # Both repository ids are mandatory; a missing variable raises KeyError
    # before any remote call is made.
    source_repo = os.environ['HF_SRC_REPO']
    target_repo = os.environ['HF_DST_REPO']
    online_pick(src_repo=source_repo, dst_repo=target_repo)
34 changes: 34 additions & 0 deletions skpick/pick.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import os
import shutil

from hfutils.utils import walk_files
from tqdm.auto import tqdm

from .archive import archive_unpack, get_archive_type
from .check import check_type


def pick_from_package(zip_file: str, dst_dir: str):
    """
    Copy the files of one archive into per-type sub-directories of ``dst_dir``.

    Every ``(file, relpath)`` pair produced by ``archive_unpack`` is classified with
    ``check_type``. Files for which it returns ``None`` are skipped; the others are
    copied to ``<dst_dir>/<type>/<relpath>``.

    :param zip_file: Path of the archive to read.
    :param dst_dir: Output directory; created when missing.
    """
    os.makedirs(dst_dir, exist_ok=True)
    for src_path, rel_name in archive_unpack(zip_file):
        category = check_type(src_path)
        if category is None:
            continue

        target = os.path.join(dst_dir, category, rel_name)
        parent = os.path.dirname(target)
        if parent:
            os.makedirs(parent, exist_ok=True)
        shutil.copyfile(src_path, target)


def pick_from_dir_of_packages(zip_dir: str, dst_dir: str):
    """
    Run :func:`pick_from_package` on every archive found under ``zip_dir``.

    A file counts as an archive when ``get_archive_type`` accepts it; files for which it
    raises ``ValueError`` are ignored.

    :param zip_dir: Directory that is walked recursively for archives.
    :param dst_dir: Output directory shared by all processed archives.
    """
    zip_files = []
    for file in walk_files(zip_dir):
        try:
            get_archive_type(file)
        except ValueError:
            # Not a recognised archive; skipping it is intended.
            continue
        # hfutils' walk_files yields paths relative to the walked directory, so anchor
        # them on zip_dir; otherwise they would only resolve when cwd == zip_dir.
        # os.path.join leaves an already-absolute path untouched.
        zip_files.append(os.path.join(zip_dir, file))

    for file in tqdm(zip_files, desc='Zip packages'):
        pick_from_package(file, dst_dir)

0 comments on commit 1272076

Please sign in to comment.