Skip to content

Commit

Permalink
Merge pull request #17 from deepghs/dev/safebooru
Browse files Browse the repository at this point in the history
dev(narugo): add support for safebooru
  • Loading branch information
narugo1992 authored Dec 12, 2024
2 parents 5be5c5b + ef58632 commit 7dc1730
Show file tree
Hide file tree
Showing 13 changed files with 155 additions and 0 deletions.
1 change: 1 addition & 0 deletions cheesechaser/datapool/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from .nozomi import NozomiDataPool
from .realbooru import RealbooruDataPool
from .rule34 import Rule34DataPool, Rule34WebpDataPool
from .safebooru import SafebooruDataPool, SafebooruWebpDataPool
from .table import TableBasedHfDataPool, SimpleTableHfDataPool
from .threedbooru import ThreedbooruDataPool
from .yande import YandeDataPool, YandeWebpDataPool
Expand Down
99 changes: 99 additions & 0 deletions cheesechaser/datapool/safebooru.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
"""
This module provides data pool classes for accessing Safebooru image data.
It contains two classes:
1. SafebooruDataPool: For accessing the full Safebooru dataset.
2. SafebooruWebpDataPool: For accessing the WebP-formatted Safebooru dataset with 4M pixel images.
Both classes inherit from IncrementIDDataPool and provide easy access to the respective datasets
stored in Hugging Face repositories. These classes simplify the process of retrieving and working
with Safebooru image data, allowing users to easily integrate this data into their projects or
research.
.. note::
    The datasets `deepghs/safebooru_full <https://huggingface.co/datasets/deepghs/safebooru_full>`_ and
    `deepghs/safebooru-webp-4Mpixel <https://huggingface.co/datasets/deepghs/safebooru-webp-4Mpixel>`_
    are gated; you must be granted access to them before using this module.
"""

from typing import Optional

from .base import IncrementIDDataPool

# Despite the ``_GELBOORU`` prefix, this constant holds the Safebooru repository id.
_GELBOORU_REPO = 'deepghs/safebooru_full'


class SafebooruDataPool(IncrementIDDataPool):
    """
    A data pool class for accessing the full Safebooru dataset.

    This class inherits from IncrementIDDataPool and is configured to read both
    the data and the index from the 'deepghs/safebooru_full' repository, at the
    same revision.

    :param revision: The revision of the dataset to use, defaults to 'main'.
    :type revision: str
    :param hf_token: Hugging Face authentication token, defaults to None.
    :type hf_token: Optional[str]

    Note:
        This class uses a base level of 4 for file organization; the value is
        passed to IncrementIDDataPool as ``base_level``.
        The dataset is gated, so authentication may be required to access it.
    """

    def __init__(self, revision: str = 'main', hf_token: Optional[str] = None):
        """
        Initialize the SafebooruDataPool.

        :param revision: The revision of the dataset to use, defaults to 'main'.
        :type revision: str
        :param hf_token: Hugging Face authentication token, defaults to None.
        :type hf_token: Optional[str]
        """
        IncrementIDDataPool.__init__(
            self,
            data_repo_id=_GELBOORU_REPO,
            data_revision=revision,
            # The index lives in the same repository and revision as the data.
            idx_repo_id=_GELBOORU_REPO,
            idx_revision=revision,
            # A single integer level, as documented for this class and as used by
            # SafebooruWebpDataPool; a list value here contradicted the docstring.
            base_level=4,
            # Forwarded in the same way as SafebooruWebpDataPool does, so gated
            # access can be authenticated explicitly.
            hf_token=hf_token,
        )


# Despite the ``_GELBOORU`` prefix, this constant holds the Safebooru WebP repository id.
_GELBOORU_WEBP_REPO = 'deepghs/safebooru-webp-4Mpixel'


class SafebooruWebpDataPool(IncrementIDDataPool):
    """
    Data pool for the WebP-formatted Safebooru dataset with 4M pixel images.

    The pool is an IncrementIDDataPool bound to the
    'deepghs/safebooru-webp-4Mpixel' repository, which serves as both the data
    repository and the index repository at the same revision.

    :param revision: The revision of the dataset to use, defaults to 'main'.
    :type revision: str
    :param hf_token: Hugging Face authentication token, defaults to None.
    :type hf_token: Optional[str]

    Note:
        ``base_level=3`` is passed to IncrementIDDataPool for file organization.
        The dataset is gated, so authentication may be required to access it.
    """

    def __init__(self, revision: str = 'main', hf_token: Optional[str] = None):
        """
        Initialize the SafebooruWebpDataPool.

        :param revision: The revision of the dataset to use, defaults to 'main'.
        :type revision: str
        :param hf_token: Hugging Face authentication token, defaults to None.
        :type hf_token: Optional[str]
        """
        # One repository and one revision back both the data and the index.
        repo_id = _GELBOORU_WEBP_REPO
        super().__init__(
            data_repo_id=repo_id,
            data_revision=revision,
            idx_repo_id=repo_id,
            idx_revision=revision,
            base_level=3,
            hf_token=hf_token,
        )
1 change: 1 addition & 0 deletions docs/source/api_doc/datapool/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ cheesechaser.datapool
nozomi
realbooru
rule34
safebooru
table
threedbooru
yande
Expand Down
24 changes: 24 additions & 0 deletions docs/source/api_doc/datapool/safebooru.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
cheesechaser.datapool.safebooru
========================================================

.. currentmodule:: cheesechaser.datapool.safebooru

.. automodule:: cheesechaser.datapool.safebooru


SafebooruDataPool
-----------------------------------------------------

.. autoclass:: SafebooruDataPool
:members: __doc__,__init__,__module__



SafebooruWebpDataPool
-----------------------------------------------------

.. autoclass:: SafebooruWebpDataPool
:members: __doc__,__init__,__module__



30 changes: 30 additions & 0 deletions test/datapool/test_safebooru.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import pytest
from hbutils.testing import isolated_directory

from cheesechaser.datapool import SafebooruWebpDataPool, SafebooruDataPool
from ..testings import get_testfile, dir_compare


@pytest.mark.unittest
class TestDatapoolSafebooru:
    """Download checks for the Safebooru data pools against the bundled fixture directories."""

    def test_safebooru_origin(self):
        """Download from the full dataset and compare with the ``safebooru_5`` fixture."""
        # 4000084 does not exist, so only the first four ids are expected to
        # produce files (the fixture directory holds exactly those four).
        requested_ids = [4000000, 4000001, 4000002, 4000003, 4000084]
        with isolated_directory():
            SafebooruDataPool().batch_download_to_directory(
                resource_ids=requested_ids,
                dst_dir='.',
            )

            dir_compare('.', get_testfile('safebooru_5'))

    def test_safebooru_webp(self):
        """Download from the WebP dataset and compare with the ``safebooru_webp_5`` fixture."""
        # 4000084 does not exist, so only the first four ids are expected to
        # produce files (the fixture directory holds exactly those four).
        requested_ids = [4000000, 4000001, 4000002, 4000003, 4000084]
        with isolated_directory():
            SafebooruWebpDataPool().batch_download_to_directory(
                resource_ids=requested_ids,
                dst_dir='.',
            )

            dir_compare('.', get_testfile('safebooru_webp_5'))
Binary file added test/testfile/safebooru_5/4000000.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added test/testfile/safebooru_5/4000001.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added test/testfile/safebooru_5/4000002.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added test/testfile/safebooru_5/4000003.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added test/testfile/safebooru_webp_5/4000000.webp
Binary file not shown.
Binary file added test/testfile/safebooru_webp_5/4000001.webp
Binary file not shown.
Binary file added test/testfile/safebooru_webp_5/4000002.webp
Binary file not shown.
Binary file added test/testfile/safebooru_webp_5/4000003.webp
Binary file not shown.

0 comments on commit 7dc1730

Please sign in to comment.