Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 7 additions & 7 deletions images/build_images
Original file line number Diff line number Diff line change
Expand Up @@ -36,12 +36,12 @@ def main():
parser = argparse.ArgumentParser(description=('Build Docker images with StreamSets '
'Data Collector parcels for clusterdock'),
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--build',
help='The build to use from S3 (e.g. of the form "2038," "2.4," "latest")',
parser.add_argument('--sdc-build',
help='The build from which to install SDC(e.g. of the form "2038," "2.4," "latest")',
default=DEFAULT_BUILD)
parser.add_argument('--dry-run', help="Don't actually do the `docker build`", action='store_true')
parser.add_argument('--s3-bucket',
help='S3 bucket to get parcels from',
parser.add_argument('--sdc-s3-bucket',
help='S3 bucket to get SDC parcels from',
default=DEFAULT_S3_BUCKET)
parser.add_argument('--sdc-version-tag',
help='A tag to use for images instead of the SDC version gleamed from '
Expand All @@ -55,10 +55,10 @@ def main():
if args.dry_run:
logger.info('Doing dry-run of tool ...')

sdc_version = _get_sdc_version(PARCEL_MANIFEST_URL_TEMPLATE.format(build=args.build, s3_bucket=args.s3_bucket))
sdc_version = _get_sdc_version(PARCEL_MANIFEST_URL_TEMPLATE.format(build=args.sdc_build, s3_bucket=args.sdc_s3_bucket))
image_name = IMAGE_NAME_TEMPLATE.format(args.sdc_version_tag or sdc_version)
csd_url = CSD_URL_TEMPLATE.format(sdc_version, build=args.build, s3_bucket=args.s3_bucket)
parcel_url = PARCEL_URL_TEMPLATE.format(sdc_version, build=args.build, s3_bucket=args.s3_bucket)
csd_url = CSD_URL_TEMPLATE.format(sdc_version, build=args.sdc_build, s3_bucket=args.sdc_s3_bucket)
parcel_url = PARCEL_URL_TEMPLATE.format(sdc_version, build=args.sdc_build, s3_bucket=args.sdc_s3_bucket)

cmd_elements = ['docker build -t {}'.format(image_name),
'--build-arg CSD_URL={}'.format(csd_url),
Expand Down
23 changes: 11 additions & 12 deletions images/sdc/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -13,25 +13,24 @@ FROM alpine:latest
MAINTAINER Dima Spivak <dima@spivak.ch>

ARG CSD_DIRECTORY=/opt/cloudera/csd
ARG CSD_URL=http://archives.streamsets.com/datacollector/2.7.2.0/csd/STREAMSETS-2.7.2.0.jar
ARG CSD_URL=http://archives.streamsets.com/datacollector/3.1.0.0/csd/STREAMSETS-3.1.0.0.jar

ARG PARCEL_DIRECTORY=/opt/cloudera/parcels/STREAMSETS_DATACOLLECTOR-2.7.2.0
ARG PARCEL_URL=http://archives.streamsets.com/datacollector/2.7.2.0/parcel/STREAMSETS_DATACOLLECTOR-2.7.2.0-el6.parcel
ARG PARCEL_REPO_DIRECTORY=/opt/cloudera/parcel-repo
ARG PARCEL_URL=http://archives.streamsets.com/datacollector/3.1.0.0/parcel/STREAMSETS_DATACOLLECTOR-3.1.0.0-el6.parcel

RUN apk --no-cache add tar

RUN wget -O /parcel.tgz ${PARCEL_URL} && \
mkdir -p "$(dirname ${PARCEL_DIRECTORY})" && \
tar xf /parcel.tgz -C "$(dirname ${PARCEL_DIRECTORY})" && \
rm /parcel.tgz
RUN mkdir -p ${PARCEL_REPO_DIRECTORY} && \
wget -P ${PARCEL_REPO_DIRECTORY} ${PARCEL_URL} && \
PARCEL_NAME=$(basename ${PARCEL_URL}) && \
cd ${PARCEL_REPO_DIRECTORY} && \
sha1sum ${PARCEL_NAME} | awk '{ print $1 }' > ${PARCEL_NAME}.sha

# See http://community.cloudera.com/t5/Cloudera-Manager-Installation/Stop-CM-undistributing-my-parcel/m-p/61402.
RUN touch "${PARCEL_DIRECTORY}/.dont_delete"

VOLUME ${PARCEL_DIRECTORY}
VOLUME ${PARCEL_REPO_DIRECTORY}

RUN mkdir -p "${CSD_DIRECTORY}" && \
wget -O "${CSD_DIRECTORY}/$(basename ${CSD_URL})" "${CSD_URL}"
CSD_NAME=$(basename ${CSD_URL}) && \
wget -O "${CSD_DIRECTORY}/${CSD_NAME}" "${CSD_URL}"

VOLUME ${CSD_DIRECTORY}

Expand Down
46 changes: 19 additions & 27 deletions start.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@
'1.3.0': '5.11.0',
'1.4.0': '5.12.0'}

SDC_PARCEL_REPO_URL = 'https://archives.streamsets.com/datacollector/{}/parcel/'
DEFAULT_SDC_S3_BUCKET = 'archives.streamsets.com'
SDC_PORT = 18630

logger = logging.getLogger('clusterdock.{}'.format(__name__))
Expand Down Expand Up @@ -105,6 +105,7 @@ def main(args):
node.volumes.append(java_image)

if args.sdc_version:
logger.info('args.sdc_version = %s', args.sdc_version)
sdc_parcel_image = ('{}/{}/clusterdock:topology_cdh-'
'streamsets_datacollector-{}').format(args.registry,
args.namespace
Expand Down Expand Up @@ -243,33 +244,23 @@ def cm_server_not_dead(primary_node):
deployment.update_cm_config(configs={'manages_parcels': True})

if args.sdc_version:
# We install StreamSets Data Collector using the local repo /opt/cloudera/parcel-repo.
# Set file and folder permissions correctly.
commands = ['chown cloudera-scm:cloudera-scm /opt/cloudera/csd',
'chown cloudera-scm:cloudera-scm /opt/cloudera/parcel-repo',
'chown cloudera-scm:cloudera-scm /opt/cloudera/csd/STREAMSETS*.jar',
'chmod 644 /opt/cloudera/csd/STREAMSETS*.jar',
'chown cloudera-scm:cloudera-scm /opt/cloudera/parcel-repo/STREAMSETS_*']
primary_node.execute(' && '.join(commands))

# The parcel is already present. Hence just distribute and activate it after refreshing parcel repos.
product = 'STREAMSETS_DATACOLLECTOR'
deployment.refresh_parcel_repos()
deployment.cluster(DEFAULT_CLUSTER_NAME).parcels
# Remove RC from version.
version = args.sdc_version.rsplit('-RC')[0]
sdc_parcel = cm_cluster.parcel(product=product, version=version)

# After we set CM's "Manage Parcels" config to False, the SDC parcel becomes
# undistributed. After this, we may need to add the SDC parcel repo URL in order
# to be able to re-activate it.
try:
sdc_parcel.wait_for_stage('AVAILABLE_REMOTELY')
except cm.ParcelNotFoundError:
for config in deployment.get_cm_config():
if config['name'] == 'REMOTE_PARCEL_REPO_URLS':
break
else:
raise Exception('Failed to find remote parcel repo URLs configuration.')
parcel_repo_urls = config['value']

sdc_parcel_repo_url = SDC_PARCEL_REPO_URL.format(args.sdc_version)
logger.debug('Adding SDC parcel repo URL (%s) ...', sdc_parcel_repo_url)
remote_parcel_repo_urls = '{},{}'.format(parcel_repo_urls, sdc_parcel_repo_url)
deployment.update_cm_config({'REMOTE_PARCEL_REPO_URLS': remote_parcel_repo_urls})

logger.debug('Refreshing parcel repos ...')
deployment.refresh_parcel_repos()

sdc_parcel.download().distribute().activate()
sdc_parcel = cm_cluster.parcel(product=product, version=version, stage='DOWNLOADED')
sdc_parcel.distribute().activate()

if args.include_services:
if args.exclude_services:
Expand Down Expand Up @@ -330,7 +321,7 @@ def cm_server_not_dead(primary_node):

if args.sdc_version:
logger.info('Configuring StreamSets Data Collector ...')
_configure_sdc(deployment, cluster, sdc_version=args.sdc_version, is_kerberos_enabled=args.kerberos)
_configure_sdc(deployment, cluster, is_kerberos_enabled=args.kerberos)

if args.kerberos:
logger.info('Configure Cloudera Manager for Kerberos ...')
Expand All @@ -353,6 +344,7 @@ def cm_server_not_dead(primary_node):
cluster_name=DEFAULT_CLUSTER_NAME,
cluster=cluster, quiet=not args.verbose)


def _configure_kdc(cluster, kerberos_principals, quiet):
kdc_node = cluster.kdc_node

Expand Down Expand Up @@ -684,7 +676,7 @@ def _configure_kudu(deployment, cluster, kudu_version):
configs)


def _configure_sdc(deployment, cluster, sdc_version, is_kerberos_enabled):
def _configure_sdc(deployment, cluster, is_kerberos_enabled):
logger.info('Adding StreamSets service to cluster (%s) ...', DEFAULT_CLUSTER_NAME)
datacollector_role = {'type': 'DATACOLLECTOR',
'hostRef': {'hostId': cluster.primary_node.host_id}}
Expand Down