Skip to content

Commit

Permalink
[SPARK-6191] [EC2] Generalize ability to download libs
Browse files Browse the repository at this point in the history
Right now we have a method to specifically download boto. This PR generalizes it so it's easy to download additional libraries if we want.

For example, adding new external libraries for spark-ec2 is now as simple as:

```python
external_libs = [
    {
        "name": "boto",
        "version": "2.34.0",
        "md5": "5556223d2d0cc4d06dd4829e671dcecd"
    },
    {
        "name": "PyYAML",
        "version": "3.11",
        "md5": "f50e08ef0fe55178479d3a618efe21db"
    },
    {
        "name": "argparse",
        "version": "1.3.0",
        "md5": "9bcf7f612190885c8c85e30ba41db3c7"
    }
]
```
Likely use cases:
* Downloading PyYAML to allow spark-ec2 configs to be persisted as a YAML file. ([SPARK-925](https://issues.apache.org/jira/browse/SPARK-925))
* Downloading argparse to clean up / modernize our option parsing.

First run output, with PyYAML and argparse added just for demonstration purposes:

```shell
$ ./spark-ec2 --version
Downloading external libraries that spark-ec2 needs from PyPI to /path/to/spark/ec2/lib...
This should be a one-time operation.
 - Downloading boto...
 - Finished downloading boto.
 - Downloading PyYAML...
 - Finished downloading PyYAML.
 - Downloading argparse...
 - Finished downloading argparse.
spark-ec2 1.2.1
```

Output thereafter:

```shell
$ ./spark-ec2 --version
spark-ec2 1.2.1
```

Author: Nicholas Chammas <nicholas.chammas@gmail.com>

Closes apache#4919 from nchammas/setup-ec2-libs and squashes the following commits:

a077955 [Nicholas Chammas] print default region
c95fb7d [Nicholas Chammas] to docstring
5448845 [Nicholas Chammas] remove libs added for demo purposes
60d8c23 [Nicholas Chammas] generalize ability to download libs
  • Loading branch information
nchammas authored and srowen committed Mar 10, 2015
1 parent c4c4b07 commit d14df06
Showing 1 changed file with 54 additions and 28 deletions.
82 changes: 54 additions & 28 deletions ec2/spark_ec2.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,34 +70,60 @@
DEFAULT_SPARK_EC2_BRANCH = "branch-1.3"


def setup_boto():
    """
    Download Boto from PyPI into SPARK_EC2_DIR/lib if it is not already present there,
    then put the unpacked boto directory at the front of sys.path.

    The tarball's MD5 digest is checked before it is unpacked; on a mismatch an error
    is written to stderr and the process exits with status 1.
    """
    version = "boto-2.34.0"
    md5 = "5556223d2d0cc4d06dd4829e671dcecd"
    url = "https://pypi.python.org/packages/source/b/boto/%s.tar.gz" % version
    lib_dir = os.path.join(SPARK_EC2_DIR, "lib")
    if not os.path.exists(lib_dir):
        os.mkdir(lib_dir)
    boto_lib_dir = os.path.join(lib_dir, version)
    if not os.path.isdir(boto_lib_dir):
        tgz_file_path = os.path.join(lib_dir, "%s.tar.gz" % version)
        print("Downloading Boto from PyPi")
        download_stream = urllib2.urlopen(url)
        try:
            with open(tgz_file_path, "wb") as tgz_file:
                tgz_file.write(download_stream.read())
        finally:
            # Release the HTTP connection even if the read or the write fails.
            download_stream.close()
        # The archive is binary data: it must be read back in binary mode, otherwise
        # newline translation on some platforms changes the bytes and the digest.
        with open(tgz_file_path, "rb") as tgz_file:
            if hashlib.md5(tgz_file.read()).hexdigest() != md5:
                stderr.write("ERROR: Got wrong md5sum for Boto\n")
                sys.exit(1)
        tar = tarfile.open(tgz_file_path)
        try:
            tar.extractall(path=lib_dir)
        finally:
            # Close the archive even if extraction fails part-way through.
            tar.close()
        os.remove(tgz_file_path)
        print("Finished downloading Boto")
    sys.path.insert(0, boto_lib_dir)
def setup_external_libs(libs):
    """
    Download external libraries from PyPI to SPARK_EC2_DIR/lib/ and add them to sys.path.

    `libs` is an iterable of dicts, each with the keys "name", "version" and "md5".
    For every entry the source tarball "<name>-<version>.tar.gz" is fetched from PyPI,
    its MD5 hex digest is compared with the "md5" value, and the archive is unpacked
    into SPARK_EC2_DIR/lib/. An entry whose directory SPARK_EC2_DIR/lib/<name>-<version>
    already exists is not downloaded again.

    Each library directory is inserted into sys.path at index 1, whether or not it had
    to be downloaded. On an MD5 mismatch an error is written to stderr and the process
    exits with status 1.
    """
    PYPI_URL_PREFIX = "https://pypi.python.org/packages/source"
    SPARK_EC2_LIB_DIR = os.path.join(SPARK_EC2_DIR, "lib")

    if not os.path.exists(SPARK_EC2_LIB_DIR):
        print(
            "Downloading external libraries that spark-ec2 needs from PyPI to {path}...".format(
                path=SPARK_EC2_LIB_DIR
            )
        )
        print("This should be a one-time operation.")
        os.mkdir(SPARK_EC2_LIB_DIR)

    for lib in libs:
        versioned_lib_name = "{n}-{v}".format(n=lib["name"], v=lib["version"])
        lib_dir = os.path.join(SPARK_EC2_LIB_DIR, versioned_lib_name)

        if not os.path.isdir(lib_dir):
            tgz_file_path = os.path.join(SPARK_EC2_LIB_DIR, versioned_lib_name + ".tar.gz")
            print(" - Downloading {lib}...".format(lib=lib["name"]))
            download_stream = urllib2.urlopen(
                "{prefix}/{first_letter}/{lib_name}/{lib_name}-{lib_version}.tar.gz".format(
                    prefix=PYPI_URL_PREFIX,
                    first_letter=lib["name"][:1],
                    lib_name=lib["name"],
                    lib_version=lib["version"]
                )
            )
            try:
                with open(tgz_file_path, "wb") as tgz_file:
                    tgz_file.write(download_stream.read())
            finally:
                # Release the HTTP connection even if the read or the write fails.
                download_stream.close()
            # The archive is binary data: it must be read back in binary mode, otherwise
            # newline translation on some platforms changes the bytes and the digest.
            with open(tgz_file_path, "rb") as tgz_file:
                if hashlib.md5(tgz_file.read()).hexdigest() != lib["md5"]:
                    stderr.write("ERROR: Got wrong md5sum for {lib}.\n".format(lib=lib["name"]))
                    sys.exit(1)
            tar = tarfile.open(tgz_file_path)
            try:
                tar.extractall(path=SPARK_EC2_LIB_DIR)
            finally:
                # Close the archive even if extraction fails part-way through.
                tar.close()
            os.remove(tgz_file_path)
            print(" - Finished downloading {lib}.".format(lib=lib["name"]))
        sys.path.insert(1, lib_dir)


# Libraries that setup_external_libs() fetches. Only PyPI libraries are supported.
# Each entry names the PyPI project, the release to download, and the MD5 hex digest
# that the downloaded source tarball is checked against.
external_libs = [
    {
        "name": "boto",
        "version": "2.34.0",
        "md5": "5556223d2d0cc4d06dd4829e671dcecd",
    },
]

setup_external_libs(external_libs)

setup_boto()
import boto
from boto.ec2.blockdevicemapping import BlockDeviceMapping, BlockDeviceType, EBSBlockDeviceType
from boto import ec2
Expand Down Expand Up @@ -136,7 +162,7 @@ def parse_args():
help="Master instance type (leave empty for same as instance-type)")
parser.add_option(
"-r", "--region", default="us-east-1",
help="EC2 region used to launch instances in, or to find them in")
help="EC2 region used to launch instances in, or to find them in (default: %default)")
parser.add_option(
"-z", "--zone", default="",
help="Availability zone to launch instances in, or 'all' to spread " +
Expand Down Expand Up @@ -230,7 +256,7 @@ def parse_args():
"(e.g -Dspark.worker.timeout=180)")
parser.add_option(
"--user-data", type="string", default="",
help="Path to a user-data file (most AMI's interpret this as an initialization script)")
help="Path to a user-data file (most AMIs interpret this as an initialization script)")
parser.add_option(
"--authorized-address", type="string", default="0.0.0.0/0",
help="Address to authorize on created security groups (default: %default)")
Expand Down

0 comments on commit d14df06

Please sign in to comment.