Skip to content

Commit ccf2b08

Browse files
kirtiv1 authored and Dima committed
Support for installation from a branch of StreamSets Data Collector (#21)
* Support for installation from a branch of StreamSets Data Collector
* Install StreamSets DataCollector parcel using a local parcel repository
* Work on review comments, make arguments same across topology.yaml and build_images script
* Remove arguments sdc-build, sdc-s3-bucket etc. and just keep sdc-version
1 parent ae33812 commit ccf2b08

File tree

3 files changed

+36
-46
lines changed

3 files changed

+36
-46
lines changed

images/build_images

Lines changed: 7 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -36,12 +36,12 @@ def main():
3636
parser = argparse.ArgumentParser(description=('Build Docker images with StreamSets '
3737
'Data Collector parcels for clusterdock'),
3838
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
39-
parser.add_argument('--build',
40-
help='The build to use from S3 (e.g. of the form "2038," "2.4," "latest")',
39+
parser.add_argument('--sdc-build',
40+
help='The build from which to install SDC(e.g. of the form "2038," "2.4," "latest")',
4141
default=DEFAULT_BUILD)
4242
parser.add_argument('--dry-run', help="Don't actually do the `docker build`", action='store_true')
43-
parser.add_argument('--s3-bucket',
44-
help='S3 bucket to get parcels from',
43+
parser.add_argument('--sdc-s3-bucket',
44+
help='S3 bucket to get SDC parcels from',
4545
default=DEFAULT_S3_BUCKET)
4646
parser.add_argument('--sdc-version-tag',
4747
help='A tag to use for images instead of the SDC version gleamed from '
@@ -55,10 +55,10 @@ def main():
5555
if args.dry_run:
5656
logger.info('Doing dry-run of tool ...')
5757

58-
sdc_version = _get_sdc_version(PARCEL_MANIFEST_URL_TEMPLATE.format(build=args.build, s3_bucket=args.s3_bucket))
58+
sdc_version = _get_sdc_version(PARCEL_MANIFEST_URL_TEMPLATE.format(build=args.sdc_build, s3_bucket=args.sdc_s3_bucket))
5959
image_name = IMAGE_NAME_TEMPLATE.format(args.sdc_version_tag or sdc_version)
60-
csd_url = CSD_URL_TEMPLATE.format(sdc_version, build=args.build, s3_bucket=args.s3_bucket)
61-
parcel_url = PARCEL_URL_TEMPLATE.format(sdc_version, build=args.build, s3_bucket=args.s3_bucket)
60+
csd_url = CSD_URL_TEMPLATE.format(sdc_version, build=args.sdc_build, s3_bucket=args.sdc_s3_bucket)
61+
parcel_url = PARCEL_URL_TEMPLATE.format(sdc_version, build=args.sdc_build, s3_bucket=args.sdc_s3_bucket)
6262

6363
cmd_elements = ['docker build -t {}'.format(image_name),
6464
'--build-arg CSD_URL={}'.format(csd_url),

images/sdc/Dockerfile

Lines changed: 11 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -13,25 +13,24 @@ FROM alpine:latest
1313
MAINTAINER Dima Spivak <dima@spivak.ch>
1414

1515
ARG CSD_DIRECTORY=/opt/cloudera/csd
16-
ARG CSD_URL=http://archives.streamsets.com/datacollector/2.7.2.0/csd/STREAMSETS-2.7.2.0.jar
16+
ARG CSD_URL=http://archives.streamsets.com/datacollector/3.1.0.0/csd/STREAMSETS-3.1.0.0.jar
1717

18-
ARG PARCEL_DIRECTORY=/opt/cloudera/parcels/STREAMSETS_DATACOLLECTOR-2.7.2.0
19-
ARG PARCEL_URL=http://archives.streamsets.com/datacollector/2.7.2.0/parcel/STREAMSETS_DATACOLLECTOR-2.7.2.0-el6.parcel
18+
ARG PARCEL_REPO_DIRECTORY=/opt/cloudera/parcel-repo
19+
ARG PARCEL_URL=http://archives.streamsets.com/datacollector/3.1.0.0/parcel/STREAMSETS_DATACOLLECTOR-3.1.0.0-el6.parcel
2020

2121
RUN apk --no-cache add tar
2222

23-
RUN wget -O /parcel.tgz ${PARCEL_URL} && \
24-
mkdir -p "$(dirname ${PARCEL_DIRECTORY})" && \
25-
tar xf /parcel.tgz -C "$(dirname ${PARCEL_DIRECTORY})" && \
26-
rm /parcel.tgz
23+
RUN mkdir -p ${PARCEL_REPO_DIRECTORY} && \
24+
wget -P ${PARCEL_REPO_DIRECTORY} ${PARCEL_URL} && \
25+
PARCEL_NAME=$(basename ${PARCEL_URL}) && \
26+
cd ${PARCEL_REPO_DIRECTORY} && \
27+
sha1sum ${PARCEL_NAME} | awk '{ print $1 }' > ${PARCEL_NAME}.sha
2728

28-
# See http://community.cloudera.com/t5/Cloudera-Manager-Installation/Stop-CM-undistributing-my-parcel/m-p/61402.
29-
RUN touch "${PARCEL_DIRECTORY}/.dont_delete"
30-
31-
VOLUME ${PARCEL_DIRECTORY}
29+
VOLUME ${PARCEL_REPO_DIRECTORY}
3230

3331
RUN mkdir -p "${CSD_DIRECTORY}" && \
34-
wget -O "${CSD_DIRECTORY}/$(basename ${CSD_URL})" "${CSD_URL}"
32+
CSD_NAME=$(basename ${CSD_URL}) && \
33+
wget -O "${CSD_DIRECTORY}/${CSD_NAME}" "${CSD_URL}"
3534

3635
VOLUME ${CSD_DIRECTORY}
3736

start.py

Lines changed: 18 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -62,7 +62,7 @@
6262
'1.3.0': '5.11.0',
6363
'1.4.0': '5.12.0'}
6464

65-
SDC_PARCEL_REPO_URL = 'https://archives.streamsets.com/datacollector/{}/parcel/'
65+
DEFAULT_SDC_S3_BUCKET = 'archives.streamsets.com'
6666
SDC_PORT = 18630
6767

6868
logger = logging.getLogger('clusterdock.{}'.format(__name__))
@@ -115,6 +115,7 @@ def main(args):
115115
node.volumes.append(spark2_parcel_image)
116116

117117
if args.sdc_version:
118+
logger.info('args.sdc_version = %s', args.sdc_version)
118119
sdc_parcel_image = ('{}/{}/clusterdock:topology_cdh-'
119120
'streamsets_datacollector-{}').format(args.registry,
120121
args.namespace
@@ -256,33 +257,23 @@ def cm_server_not_dead(primary_node):
256257
_install_service_from_local_repo(deployment, cluster, product='SPARK2', prefix='SPARK2')
257258

258259
if args.sdc_version:
260+
# We install StreamSets DataCollector using local repo /opt/cloudera/parcel-repo.
261+
# Set file and folder permissions correctly.
262+
commands = ['chown cloudera-scm:cloudera-scm /opt/cloudera/csd',
263+
'chown cloudera-scm:cloudera-scm /opt/cloudera/parcel-repo',
264+
'chown cloudera-scm:cloudera-scm /opt/cloudera/csd/STREAMSETS*.jar',
265+
'chmod 644 /opt/cloudera/csd/STREAMSETS*.jar',
266+
'chown cloudera-scm:cloudera-scm /opt/cloudera/parcel-repo/STREAMSETS_*']
267+
primary_node.execute(' && '.join(commands))
268+
269+
# The parcel is already present. Hence just distribute and activate it after refreshing parcel repos.
259270
product = 'STREAMSETS_DATACOLLECTOR'
271+
deployment.refresh_parcel_repos()
272+
deployment.cluster(DEFAULT_CLUSTER_NAME).parcels
260273
# Remove RC from version.
261274
version = args.sdc_version.rsplit('-RC')[0]
262-
sdc_parcel = cm_cluster.parcel(product=product, version=version)
263-
264-
# After we set CM's "Manage Parcels" config to False, the SDC parcel becomes
265-
# undistributed. After this, we may need to add the SDC parcel repo URL in order
266-
# to be able to re-activate it.
267-
try:
268-
sdc_parcel.wait_for_stage('AVAILABLE_REMOTELY')
269-
except cm.ParcelNotFoundError:
270-
for config in deployment.get_cm_config():
271-
if config['name'] == 'REMOTE_PARCEL_REPO_URLS':
272-
break
273-
else:
274-
raise Exception('Failed to find remote parcel repo URLs configuration.')
275-
parcel_repo_urls = config['value']
276-
277-
sdc_parcel_repo_url = SDC_PARCEL_REPO_URL.format(args.sdc_version)
278-
logger.debug('Adding SDC parcel repo URL (%s) ...', sdc_parcel_repo_url)
279-
remote_parcel_repo_urls = '{},{}'.format(parcel_repo_urls, sdc_parcel_repo_url)
280-
deployment.update_cm_config({'REMOTE_PARCEL_REPO_URLS': remote_parcel_repo_urls})
281-
282-
logger.debug('Refreshing parcel repos ...')
283-
deployment.refresh_parcel_repos()
284-
285-
sdc_parcel.download().distribute().activate()
275+
sdc_parcel = cm_cluster.parcel(product=product, version=version, stage='DOWNLOADED')
276+
sdc_parcel.distribute().activate()
286277

287278
if args.include_services:
288279
if args.exclude_services:
@@ -347,7 +338,7 @@ def cm_server_not_dead(primary_node):
347338

348339
if args.sdc_version:
349340
logger.info('Configuring StreamSets Data Collector ...')
350-
_configure_sdc(deployment, cluster, sdc_version=args.sdc_version, is_kerberos_enabled=args.kerberos)
341+
_configure_sdc(deployment, cluster, is_kerberos_enabled=args.kerberos)
351342

352343
if args.kerberos:
353344
logger.info('Configure Cloudera Manager for Kerberos ...')
@@ -778,7 +769,7 @@ def _configure_kudu(deployment, cluster, kudu_version):
778769
configs)
779770

780771

781-
def _configure_sdc(deployment, cluster, sdc_version, is_kerberos_enabled):
772+
def _configure_sdc(deployment, cluster, is_kerberos_enabled):
782773
logger.info('Adding StreamSets service to cluster (%s) ...', DEFAULT_CLUSTER_NAME)
783774
datacollector_role = {'type': 'DATACOLLECTOR',
784775
'hostRef': {'hostId': cluster.primary_node.host_id}}

0 commit comments

Comments
 (0)