
Commit 7f863c4

Merge pull request #26 from scrapinghub/gzip

gzip compression for cache

2 parents: 3c8adc3 + bf85321

File tree: 3 files changed, +35 −4 lines

README.rst

Lines changed: 3 additions & 0 deletions

@@ -242,6 +242,9 @@ Provider settings
   be placed in the ``.scrapy`` folder. File will be created if it doesn't exist.
   Cache is useful for development; AutoExtract requests bypass standard Scrapy
   cache when providers are used.
+- ``AUTOEXTRACT_CACHE_GZIP`` [optional] when True (default), cached AutoExtract
+  responses are compressed using gzip. Set this option to False to turn
+  compression off.
 
 Limitations
 ===========
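For reference, the new option is a regular Scrapy setting; a minimal sketch of a project's settings.py (compression is on by default, so the setting only needs to appear when turning it off):

    # settings.py
    # AUTOEXTRACT_CACHE_GZIP defaults to True; set it to False to store
    # cached AutoExtract responses uncompressed.
    AUTOEXTRACT_CACHE_GZIP = False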

scrapy_autoextract/cache.py

Lines changed: 30 additions & 3 deletions

@@ -1,5 +1,8 @@
 import abc
 import json
+import gzip
+import pickle
+import sqlite3
 
 import sqlitedict
 from autoextract.request import Request
@@ -39,8 +42,30 @@ def __str__(self):
 
 
 class AutoExtractCache(_Cache):
-    def __init__(self, path):
-        self.db = sqlitedict.SqliteDict(path, autocommit=True)
+    def __init__(self, path, *, compressed=True):
+        self.compressed = compressed
+        tablename = 'responses_gzip' if compressed else 'responses'
+        self.db = sqlitedict.SqliteDict(path,
+                                        tablename=tablename,
+                                        autocommit=True,
+                                        encode=self.encode,
+                                        decode=self.decode)
+
+    def encode(self, obj):
+        # based on sqlitedict.encode
+        data = pickle.dumps(obj, pickle.HIGHEST_PROTOCOL)
+        if self.compressed:
+            data = gzip.compress(data, compresslevel=3)
+        return sqlite3.Binary(data)
+
+    def decode(self, obj):
+        # based on sqlitedict.decode
+        data = bytes(obj)
+        if self.compressed:
+            # gzip is slightly less efficient than raw zlib, but it does
+            # e.g. crc checks out of the box
+            data = gzip.decompress(data)
+        return pickle.loads(data)
 
     @classmethod
     def fingerprint(cls, request: Request) -> str:
@@ -51,7 +76,9 @@ def fingerprint(cls, request: Request) -> str:
         )
 
     def __str__(self):
-        return f"AutoExtractCache <{self.db.filename} | {len(self.db)} records>"
+        return f"AutoExtractCache <{self.db.filename} | " \
+               f"compressed: {self.compressed} | " \
+               f"{len(self.db)} records>"
 
     def __getitem__(self, fingerprint: str):
         return self.db[fingerprint]
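To see what the encode/decode pair does, here is a standalone round-trip sketch of the same logic (the free function names and the sample payload are illustrative, not part of the commit): values are pickled, optionally gzip-compressed at compresslevel 3, and wrapped in sqlite3.Binary for storage; decoding reverses the steps.

    import gzip
    import pickle
    import sqlite3

    def encode(obj, compressed=True):
        # pickle the value, optionally gzip it, wrap it for SQLite storage
        data = pickle.dumps(obj, pickle.HIGHEST_PROTOCOL)
        if compressed:
            data = gzip.compress(data, compresslevel=3)
        return sqlite3.Binary(data)

    def decode(blob, compressed=True):
        # unwrap the SQLite blob, optionally gunzip it, unpickle the value
        data = bytes(blob)
        if compressed:
            data = gzip.decompress(data)
        return pickle.loads(data)

    # repetitive payloads such as HTML compress well
    response = {'url': 'http://example.com', 'html': '<p>hello</p>' * 500}
    assert decode(encode(response)) == response
    print(len(bytes(encode(response))), 'bytes gzipped vs',
          len(bytes(encode(response, compressed=False))), 'bytes raw')

Note also that compressed and uncompressed entries go to different tables ('responses_gzip' vs 'responses'), which presumably keeps the two on-disk formats from being mixed when the setting is flipped on an existing cache file.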

scrapy_autoextract/providers.py

Lines changed: 2 additions & 1 deletion

@@ -98,7 +98,8 @@ def __init__(self, crawler: Crawler):
         if cache_filename:
             cache_filename = os.path.join(get_scrapy_data_path(createdir=True),
                                           cache_filename)
-            self.cache = AutoExtractCache(cache_filename)
+            compressed = self.settings.getbool('AUTOEXTRACT_CACHE_GZIP', True)
+            self.cache = AutoExtractCache(cache_filename, compressed=compressed)
         else:
             self.cache = DummyCache()
 
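Putting it together, the provider reads the new setting and passes it through to the cache. A minimal sketch of the equivalent wiring outside a crawler (the cache path is a placeholder):

    from scrapy.settings import Settings
    from scrapy_autoextract.cache import AutoExtractCache

    # stand-in for crawler.settings inside the provider's __init__
    settings = Settings({'AUTOEXTRACT_CACHE_GZIP': False})
    compressed = settings.getbool('AUTOEXTRACT_CACHE_GZIP', True)
    cache = AutoExtractCache('autoextract-cache.sqlite', compressed=compressed)
    print(cache)  # __str__ now reports the compression flag too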