diff --git a/.env b/.env deleted file mode 100644 index a559e65d2..000000000 --- a/.env +++ /dev/null @@ -1,5 +0,0 @@ -COUCHDB_USER=user -COUCHDB_PASSWORD=password -COUCHDB_NAME=couchdb -COUCHDB_PORT_5984_TCP_ADDR=couchdb -COUCHDB_PORT_5984_TCP_PORT=5984 \ No newline at end of file diff --git a/README.md b/README.md index 9dfb20dca..102924a60 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ A Powerful Spider(Web Crawler) System in Python. **[TRY IT NOW!][Demo]** - Write script in Python - Powerful WebUI with script editor, task monitor, project manager and result viewer -- [MySQL](https://www.mysql.com/), [MongoDB](https://www.mongodb.org/), [Redis](http://redis.io/), [SQLite](https://www.sqlite.org/), [Elasticsearch](https://www.elastic.co/products/elasticsearch); [PostgreSQL](http://www.postgresql.org/) with [SQLAlchemy](http://www.sqlalchemy.org/) as database backend +- [MySQL](https://www.mysql.com/), [CouchDB](https://couchdb.apache.org), [MongoDB](https://www.mongodb.org/), [Redis](http://redis.io/), [SQLite](https://www.sqlite.org/), [Elasticsearch](https://www.elastic.co/products/elasticsearch); [PostgreSQL](http://www.postgresql.org/) with [SQLAlchemy](http://www.sqlalchemy.org/) as database backend - [RabbitMQ](http://www.rabbitmq.com/), [Redis](http://redis.io/) and [Kombu](http://kombu.readthedocs.org/) as message queue - Task priority, retry, periodical, recrawl by age, etc... - Distributed architecture, Crawl Javascript pages, Python 2.{6,7}, 3.{3,4,5,6} support, etc... 
diff --git a/docker-compose.yaml b/docker-compose.yaml index 3b89ed19d..efdfa5678 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -1,6 +1,7 @@ version: "3.7" # docker build ./ -t pyspider:latest +# replace /path/to/dir/ to point to config_example.json services: rabbitmq: @@ -16,8 +17,12 @@ services: - pyspider ports: - "5984:5984" - env_file: .env - + environment: + - COUCHDB_NAME=couchdb + - COUCHDB_USER=user + - COUCHDB_PASSWORD=password + - COUCHDB_HTTPS=true + # OR we can replace couchdb with mysql #mysql: # image: mysql:latest # container_name: mysql @@ -27,15 +32,13 @@ services: # - MYSQL_ALLOW_EMPTY_PASSWORD=yes # networks: # - pyspider - # env_file: .env phantomjs: image: pyspider:latest container_name: phantomjs networks: - pyspider - env_file: .env volumes: - - /Users/Keith/Documents/Projects/python/python_projects/pyspider/pyspider/config_example.json:/opt/pyspider/config.json + - /path/to/dir/config_example.json:/opt/pyspider/config.json command: -c config.json phantomjs depends_on: - couchdb @@ -46,9 +49,8 @@ services: container_name: result networks: - pyspider - env_file: .env volumes: - - /Users/Keith/Documents/Projects/python/python_projects/pyspider/pyspider/config_example.json:/opt/pyspider/config.json + - /path/to/dir/config_example.json:/opt/pyspider/config.json command: -c config.json result_worker depends_on: - couchdb @@ -59,9 +61,8 @@ services: image: pyspider:latest networks: - pyspider - env_file: .env volumes: - - /Users/Keith/Documents/Projects/python/python_projects/pyspider/pyspider/config_example.json:/opt/pyspider/config.json + - /path/to/dir/config_example.json:/opt/pyspider/config.json command: -c config.json processor depends_on: - couchdb @@ -72,9 +73,8 @@ services: container_name: fetcher networks: - pyspider - env_file: .env volumes: - - /Users/Keith/Documents/Projects/python/python_projects/pyspider/pyspider/config_example.json:/opt/pyspider/config.json + - 
/path/to/dir/config_example.json:/opt/pyspider/config.json command : -c config.json fetcher depends_on: - couchdb @@ -85,9 +85,8 @@ services: container_name: scheduler networks: - pyspider - env_file: .env volumes: - - /Users/Keith/Documents/Projects/python/python_projects/pyspider/pyspider/config_example.json:/opt/pyspider/config.json + - /path/to/dir/config_example.json:/opt/pyspider/config.json command: -c config.json scheduler depends_on: - couchdb @@ -100,9 +99,8 @@ services: - "5050:5000" networks: - pyspider - env_file: .env volumes: - - /Users/Keith/Documents/Projects/python/python_projects/pyspider/pyspider/config_example.json:/opt/pyspider/config.json + - /path/to/dir/config_example.json:/opt/pyspider/config.json environment: - SCHEDULER_PORT_23333_TCP_ADDR=scheduler command: -c config.json webui diff --git a/docs/Command-Line.md b/docs/Command-Line.md index f06bcafce..8dca83f1f 100644 --- a/docs/Command-Line.md +++ b/docs/Command-Line.md @@ -72,6 +72,8 @@ sqlite: mongodb: mongodb+type://[username:password@]host1[:port1][,host2[:port2],...[,hostN[:portN]]][/[database][?options]] more: http://docs.mongodb.org/manual/reference/connection-string/ +couchdb: + couchdb+type://[username:password@]host[:port] sqlalchemy: sqlalchemy+postgresql+type://user:passwd@host:port/database sqlalchemy+mysql+mysqlconnector+type://user:passwd@host:port/database diff --git a/docs/Deployment.md b/docs/Deployment.md index 304ad6427..84ca97534 100644 --- a/docs/Deployment.md +++ b/docs/Deployment.md @@ -8,7 +8,7 @@ To deploy pyspider in product environment, running component in each process and Installation ------------ -To deploy pyspider components in each single processes, you need at least one database service. pyspider now supports [MySQL](http://www.mysql.com/), [MongoDB](http://www.mongodb.org/) and [PostgreSQL](http://www.postgresql.org/). You can choose one of them. +To deploy pyspider components in each single processes, you need at least one database service. 
pyspider now supports [MySQL](http://www.mysql.com/), [CouchDB](https://couchdb.apache.org), [MongoDB](http://www.mongodb.org/) and [PostgreSQL](http://www.postgresql.org/). You can choose one of them. And you need a message queue service to connect the components together. You can use [RabbitMQ](http://www.rabbitmq.com/) or [Redis](http://redis.io/) as message queue. @@ -63,6 +63,8 @@ sqlite: mongodb: mongodb+type://[username:password@]host1[:port1][,host2[:port2],...[,hostN[:portN]]][/[database][?options]] more: http://docs.mongodb.org/manual/reference/connection-string/ +couchdb: + couchdb+type://[username:password@]host[:port][?options] sqlalchemy: sqlalchemy+postgresql+type://user:passwd@host:port/database sqlalchemy+mysql+mysqlconnector+type://user:passwd@host:port/database diff --git a/docs/index.md b/docs/index.md index ff0d47eb2..5c4bd6f10 100644 --- a/docs/index.md +++ b/docs/index.md @@ -5,7 +5,7 @@ A Powerful Spider(Web Crawler) System in Python. **[TRY IT NOW!][Demo]** - Write script in Python - Powerful WebUI with script editor, task monitor, project manager and result viewer -- [MySQL](https://www.mysql.com/), [MongoDB](https://www.mongodb.org/), [Redis](http://redis.io/), [SQLite](https://www.sqlite.org/), [Elasticsearch](https://www.elastic.co/products/elasticsearch); [PostgreSQL](http://www.postgresql.org/) with [SQLAlchemy](http://www.sqlalchemy.org/) as database backend +- [MySQL](https://www.mysql.com/), [CouchDB](https://couchdb.apache.org), [MongoDB](https://www.mongodb.org/), [Redis](http://redis.io/), [SQLite](https://www.sqlite.org/), [Elasticsearch](https://www.elastic.co/products/elasticsearch); [PostgreSQL](http://www.postgresql.org/) with [SQLAlchemy](http://www.sqlalchemy.org/) as database backend - [RabbitMQ](http://www.rabbitmq.com/), [Redis](http://redis.io/) and [Kombu](http://kombu.readthedocs.org/) as message queue - Task priority, retry, periodical, recrawl by age, etc... 
- Distributed architecture, Crawl Javascript pages, Python 2&3, etc... diff --git a/pyspider/database/__init__.py b/pyspider/database/__init__.py index 31c7e9f34..65c658677 100644 --- a/pyspider/database/__init__.py +++ b/pyspider/database/__init__.py @@ -34,7 +34,7 @@ def connect_database(url): elasticsearch: elasticsearch+type://host:port/?index=pyspider couchdb: - couchdb+type://host[:port] + couchdb+type://[username:password@]host[:port] local: local+projectdb://filepath,filepath @@ -207,14 +207,29 @@ def _connect_elasticsearch(parsed, dbtype): def _connect_couchdb(parsed, dbtype, url): - # TODO: Add https + auth as parameters - url = "http://" + parsed.netloc + "/" + if os.environ.get('COUCHDB_HTTPS'): + url = "https://" + parsed.netloc + "/" + else: + url = "http://" + parsed.netloc + "/" params = {} - params['username'] = os.environ.get('COUCHDB_USER') or 'user' - params['password'] = os.environ.get('COUCHDB_PASSWORD') or 'password' + username = None + password = None + if '@' in parsed.netloc: + # netloc looks like: 'user:pass@couchdb:999' + url = url[:url.find("://")+3] + parsed.netloc[parsed.netloc.find("@")+1:] + "/" + # extract the username and password + username = parsed.netloc[:parsed.netloc.find(":")] + password = parsed.netloc[parsed.netloc.find(":")+1:parsed.netloc.find("@")] + + # default to env, then url, then hard coded + params['username'] = os.environ.get('COUCHDB_USER') or username or 'user' + params['password'] = os.environ.get('COUCHDB_PASSWORD') or password or 'password' + + # create required CouchDB databases if not already present requests.put(url+"_users") requests.put(url+"_replicator") + # create the admin user # NOTE: Over docker, this user is already created when COUCHDB_USER and COUCHDB_PASSWORD are set requests.put(url+'_node/_local/_config/admins/'+ params['username'], diff --git a/pyspider/run.py b/pyspider/run.py index 376032218..fd3603523 100755 --- a/pyspider/run.py +++ b/pyspider/run.py @@ -114,8 +114,10 @@ def cli(ctx, **kwargs): elif 
os.environ.get('COUCHDB_NAME'): kwargs[db] = utils.Get(lambda db=db: connect_database( 'couchdb+%s://%s:%s/%s' % ( - db, os.environ['COUCHDB_PORT_5984_TCP_ADDR'], - os.environ['COUCHDB_PORT_5984_TCP_PORT'], db))) + db, + os.environ.get('COUCHDB_PORT_5984_TCP_ADDR') or 'couchdb', + os.environ.get('COUCHDB_PORT_5984_TCP_PORT') or '5984', + db))) elif ctx.invoked_subcommand == 'bench': if kwargs['data_path'] == './data': kwargs['data_path'] += '/bench'