
Commit 7f863c4

Merge pull request #26 from scrapinghub/gzip

gzip compression for cache

2 parents: 3c8adc3 + bf85321

File tree: 3 files changed, +35 −4 lines

README.rst

Lines changed: 3 additions & 0 deletions

@@ -242,6 +242,9 @@ Provider settings
   be placed in the ``.scrapy`` folder. File will be created if it doesn't exist.
   Cache is useful for development; AutoExtract requests bypass standard Scrapy
   cache when providers are used.
+- ``AUTOEXTRACT_CACHE_GZIP`` [optional] when True (default), cached AutoExtract
+  responses are compressed using gzip. Set this option to False to turn
+  compression off.
 
 Limitations
 ===========
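For reference, the new option is a regular Scrapy setting; a minimal sketch of a project's settings.py (compression is on by default, so the setting only needs to appear when turning it off):

    # settings.py
    # AUTOEXTRACT_CACHE_GZIP defaults to True; set it to False to store
    # cached AutoExtract responses uncompressed.
    AUTOEXTRACT_CACHE_GZIP = False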

scrapy_autoextract/cache.py

Lines changed: 30 additions & 3 deletions

@@ -1,5 +1,8 @@
 import abc
 import json
+import gzip
+import pickle
+import sqlite3
 
 import sqlitedict
 from autoextract.request import Request
@@ -39,8 +42,30 @@ def __str__(self):
 
 
 class AutoExtractCache(_Cache):
-    def __init__(self, path):
-        self.db = sqlitedict.SqliteDict(path, autocommit=True)
+    def __init__(self, path, *, compressed=True):
+        self.compressed = compressed
+        tablename = 'responses_gzip' if compressed else 'responses'
+        self.db = sqlitedict.SqliteDict(path,
+                                        tablename=tablename,
+                                        autocommit=True,
+                                        encode=self.encode,
+                                        decode=self.decode)
+
+    def encode(self, obj):
+        # based on sqlitedict.encode
+        data = pickle.dumps(obj, pickle.HIGHEST_PROTOCOL)
+        if self.compressed:
+            data = gzip.compress(data, compresslevel=3)
+        return sqlite3.Binary(data)
+
+    def decode(self, obj):
+        # based on sqlitedict.decode
+        data = bytes(obj)
+        if self.compressed:
+            # gzip is slightly less efficient than raw zlib, but it does
+            # e.g. crc checks out of the box
+            data = gzip.decompress(data)
+        return pickle.loads(data)
 
     @classmethod
     def fingerprint(cls, request: Request) -> str:
@@ -51,7 +76,9 @@ def fingerprint(cls, request: Request) -> str:
         )
 
     def __str__(self):
-        return f"AutoExtractCache <{self.db.filename} | {len(self.db)} records>"
+        return f"AutoExtractCache <{self.db.filename} | " \
+               f"compressed: {self.compressed} | " \
+               f"{len(self.db)} records>"
 
     def __getitem__(self, fingerprint: str):
         return self.db[fingerprint]
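To see what the encode/decode pair does, here is a standalone round-trip sketch of the same logic (the free function names and the sample payload are illustrative, not part of the commit): values are pickled, optionally gzip-compressed at compresslevel 3, and wrapped in sqlite3.Binary for storage; decoding reverses the steps.

    import gzip
    import pickle
    import sqlite3

    def encode(obj, compressed=True):
        # pickle the value, optionally gzip it, wrap it for SQLite storage
        data = pickle.dumps(obj, pickle.HIGHEST_PROTOCOL)
        if compressed:
            data = gzip.compress(data, compresslevel=3)
        return sqlite3.Binary(data)

    def decode(blob, compressed=True):
        # unwrap the SQLite blob, optionally gunzip it, unpickle the value
        data = bytes(blob)
        if compressed:
            data = gzip.decompress(data)
        return pickle.loads(data)

    # repetitive payloads such as HTML compress well
    response = {'url': 'http://example.com', 'html': '<p>hello</p>' * 500}
    assert decode(encode(response)) == response
    print(len(bytes(encode(response))), 'bytes gzipped vs',
          len(bytes(encode(response, compressed=False))), 'bytes raw')

Note also that compressed and uncompressed entries go to different tables ('responses_gzip' vs 'responses'), which presumably keeps the two on-disk formats from being mixed when the setting is flipped on an existing cache file.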

scrapy_autoextract/providers.py

Lines changed: 2 additions & 1 deletion

@@ -98,7 +98,8 @@ def __init__(self, crawler: Crawler):
         if cache_filename:
             cache_filename = os.path.join(get_scrapy_data_path(createdir=True),
                                           cache_filename)
-            self.cache = AutoExtractCache(cache_filename)
+            compressed = self.settings.getbool('AUTOEXTRACT_CACHE_GZIP', True)
+            self.cache = AutoExtractCache(cache_filename, compressed=compressed)
         else:
             self.cache = DummyCache()
 
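Putting it together, the provider reads the new setting and passes it through to the cache. A minimal sketch of the equivalent wiring outside a crawler (the cache path is a placeholder):

    from scrapy.settings import Settings
    from scrapy_autoextract.cache import AutoExtractCache

    # stand-in for crawler.settings inside the provider's __init__
    settings = Settings({'AUTOEXTRACT_CACHE_GZIP': False})
    compressed = settings.getbool('AUTOEXTRACT_CACHE_GZIP', True)
    cache = AutoExtractCache('autoextract-cache.sqlite', compressed=compressed)
    print(cache)  # __str__ now reports the compression flag too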