1 """Partially through ENCODE3 the DCC switched to needing to upload via AWS
7 from pprint import pformat, pprint
16 from htsworkflow.submission.submission import Submission
17 from .encoded import ENCODED
18 from htsworkflow.util.rdfhelp import \
22 from htsworkflow.util.ucsc import bigWigInfo
24 from django.conf import settings
25 from django.template import Context, loader
27 LOGGER = logging.getLogger(__name__)
29 class AWSSubmission(Submission):
    def __init__(self, name, model, encode_host, lims_host):
        """Create an AWS-based submission.

        :Parameters:
          - `name`: Name of submission
          - `model`: librdf model reference
          - `encode_host`: hostname of the ENCODE DCC server to submit to
          - `lims_host`: hostname for library pages (passed to base Submission)
        """
        super(AWSSubmission, self).__init__(name, model, lims_host)
        # REST client for the ENCODE DCC; credentials come from ~/.netrc
        self.encode = ENCODED(encode_host)
        self.encode.load_netrc()
42 def upload(self, results_map, dry_run=False):
43 for an_analysis in self.analysis_nodes(results_map):
44 for metadata in self.get_metadata(an_analysis):
45 metadata['@type'] = ['file']
46 self.encode.validate(metadata)
50 LOGGER.info(json.dumps(metadata, indent=4, sort_keys=True))
53 upload = metadata['submitted_file_name'] + '.upload'
54 if not os.path.exists(upload):
55 with open(upload, 'w') as outstream:
56 outstream.write(json.dumps(metadata, indent=4, sort_keys=True))
57 LOGGER.debug(json.dumps(metadata, indent=4, sort_keys=True))
59 response = self.encode.post_json('/file', metadata)
60 LOGGER.info(json.dumps(response, indent=4, sort_keys=True))
62 item = response['@graph'][0]
63 creds = item['upload_credentials']
64 self.run_aws_cp(metadata['submitted_file_name'], creds)
66 LOGGER.info('%s already uploaded',
67 metadata['submitted_file_name'])
70 def run_aws_cp(self, pathname, creds):
71 env = os.environ.copy()
73 'AWS_ACCESS_KEY_ID': creds['access_key'],
74 'AWS_SECRET_ACCESS_KEY': creds['secret_key'],
75 'AWS_SECURITY_TOKEN': creds['session_token'],
79 subprocess.check_call(['aws', 's3', 'cp', pathname, creds['upload_url']], env=env)
80 except subprocess.CalledProcessError as e:
81 LOGGER.error('Upload of %s failed with exit code %d', pathname, e.returncode)
85 LOGGER.info('Upload of %s finished in %.2f seconds',
90 def get_metadata(self, analysis_node):
91 # convert our model names to encode project aliases
93 'Illumina HiSeq 2500': 'ENCODE:HiSeq2500'
95 query_template = loader.get_template('aws_metadata.sparql')
98 'submission': str(analysis_node.uri),
99 'submissionSet': str(self.submissionSetNS[''].uri),
101 results = self.execute_query(query_template, context)
102 LOGGER.info("scanned %s for results found %s",
103 str(analysis_node), len(results))
105 # need to adjust the results of the query slightly.
107 if 'platform' in row:
108 row['platform'] = platform_alias[row['platform']]
109 flowcell_details = {}
110 for term in ['machine', 'flowcell', 'lane', 'barcode']:
112 value = str(row[term])
113 flowcell_details[term] = value
115 if len(flowcell_details) > 0:
116 row['flowcell_details'] = [flowcell_details]