diff --git a/.gitignore b/.gitignore
index 4a4ab0ed43..d92e1820b4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,8 +1,2 @@
-.idea/
-.manager/
-workspace.xml
-*.iml
-*.sublime-*
-.DS_Store
-*~
-*.swp
+# Only project- and language-specific ignores in here. Use global .gitignore for editors etc
+.*.yml
diff --git a/hive/etl/config.yml b/hive/etl/config.yml
index 60681e9eb3..e473c2fa9e 100644
--- a/hive/etl/config.yml
+++ b/hive/etl/config.yml
@@ -1,6 +1,7 @@
 aws:
   access_key_id: ADD HERE
   secret_access_key: ADD HERE
+  emr_client_path: ADD HERE
 buckets:
   jar: ADD HERE
   in: ADD HERE
diff --git a/hive/etl/daily-etl.rb b/hive/etl/daily-etl.rb
index 1c1dca7c8d..c4dc21ff41 100755
--- a/hive/etl/daily-etl.rb
+++ b/hive/etl/daily-etl.rb
@@ -66,6 +66,39 @@
 # Determine yesterday's date
 yesterday = (Date.today - 1).strftime('%Y-%m-%d')
 
+# Now load the Ruby EMR Client
+$LOAD_PATH << config["aws"]["emr_client_path"]
+require 'amazon/coral/elasticmapreduceclient'
+require 'amazon/retry_delegator'
+
+# Credentials come from the access_key_id / secret_access_key entries that
+# config.yml defines under "aws".
+aws_config = {
+  :endpoint => "https://elasticmapreduce.amazonaws.com",
+  :ca_file => File.join(config["aws"]["emr_client_path"], "cacert.pem"),
+  :aws_access_key => config["aws"]["access_key_id"],
+  :aws_secret_key => config["aws"]["secret_access_key"],
+  :signature_algorithm => :V2
+}
+client = Amazon::Coral::ElasticMapReduceClient.new_aws_query(aws_config)
+
+# Use the retry delegator to make your client retry if it gets connection failures.
+# The predicate is handed to the delegator as :retry_if; it is true only when the
+# response carries one of the error codes listed below.
+is_retryable_error_response = Proc.new do |response|
+  if response.nil? || !response['Error']
+    false
+  else
+    # don't retry on 'Timeout' because the call might have succeeded
+    ['InternalFailure', 'Throttling', 'ServiceUnavailable'].include?(response['Error']['Code'])
+  end
+end
+
+client = Amazon::RetryDelegator.new(client, :retry_if => is_retryable_error_response)
+
+# Debug TODO: remove
+puts client.DescribeJobFlows.inspect
+
 # Runs a daily ETL job for the specific day.
# Uses the Elastic MapReduce Command Line Tool. # Parameters: