Skip to content

Commit

Permalink
fix: remove cchardet as a dependency
Browse files Browse the repository at this point in the history
`cchardet` hasn't had a commit in almost two years (8 days shy as
of this commit). [This issue](https://github.com/PyYoshi/cChardet/issues/81)
is a six-month-old bug report showing that `cchardet` doesn't support
Python 3.11, which is the most recent Python release.

`cchardet` is certainly fast, but the usage here is not performance
sensitive enough to warrant its use over `chardet`; e.g. a difference
of 800ms is probably not worth keeping problematic dependencies
around for.
  • Loading branch information
rockstar committed Apr 20, 2023
1 parent 733d025 commit 3fc53cc
Show file tree
Hide file tree
Showing 3 changed files with 5 additions and 15 deletions.
1 change: 0 additions & 1 deletion requirements_dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ pyflakes==2.4.0
pylama==8.3.8
pylint==2.17.2
twine==4.0.1
cchardet==2.1.7
chardet==5.1.0
flair==0.12.2
psutil==5.9.2
Expand Down
1 change: 0 additions & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ license_files =
package_dir =
= src
install_requires =
cchardet>=2.1.7
chardet>=5.1.0
flair>=0.11.3
psutil>=5.9.2
Expand Down
18 changes: 5 additions & 13 deletions src/watchful/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
import urllib
from typing import Callable, Dict, Generator, List, Literal, Optional, Union
from uuid import uuid4

import chardet
import requests


Expand Down Expand Up @@ -1172,7 +1174,6 @@ def is_utf8(
csv_bytes: bytes = None,
filepath: str = None,
threshold: float = 0.5,
is_fast: bool = True,
) -> bool:
"""
This function attempts to detect if the encoding of the given bytes or the
Expand All @@ -1191,7 +1192,6 @@ def is_utf8(
:type threshold: float, optional
:param is_fast: Whether to use fast encoding detection with a lower
accuracy, or not.
:type is_fast: bool, optional
:return: `True` if the detected encoding is utf-8 and has a confidence of
the given threshold or more, otherwise `False`.
:rtype: bool
Expand All @@ -1207,11 +1207,6 @@ def is_utf8(
"Only one of them needs to be specified."
)

if is_fast:
import cchardet as chardet
else:
import chardet

if csv_bytes:
res = chardet.detect(csv_bytes)
else:
Expand Down Expand Up @@ -1239,7 +1234,7 @@ def create_dataset(
filename: str = "none",
has_header: bool = True,
threshold_detect: float = 0.5,
is_fast_detect: bool = True,
is_fast_detect: bool = True, # pylint: disable=W0613
force_load: bool = True,
) -> str:
"""
Expand All @@ -1259,8 +1254,7 @@ def create_dataset(
:param threshold_detect: The minimum confidence required to accept the
detected encoding.
:type threshold_detect: float, optional
:param is_fast_detect: Whether to use fast encoding detection with a lower
accuracy, or not.
:param is_fast_detect: No longer used, but remains for API compatibility
:type is_fast_detect: bool, optional
:param force_load: The boolean indicating if the csv dataset will be loaded
even when its encoding is detected to be non-utf-8, defaults to True.
Expand All @@ -1274,9 +1268,7 @@ def create_dataset(
TODO: Add error handling.
"""

is_csv_bytes_utf8 = is_utf8(
csv_bytes, None, threshold_detect, is_fast_detect
)
is_csv_bytes_utf8 = is_utf8(csv_bytes, None, threshold_detect)

if is_csv_bytes_utf8 or force_load:
id_ = str(uuid4())
Expand Down

0 comments on commit 3fc53cc

Please sign in to comment.