-
Notifications
You must be signed in to change notification settings - Fork 80
Issue 855 #900
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Issue 855 #900
Changes from all commits
be2186b
5188ef0
905b14b
93a8760
6c3d5a0
60e5276
2d7fff4
5db7e9e
90da22c
2db9c91
5e7d687
09c7586
688c250
d9c73ca
ef7b3ec
ba42a92
b13055a
f32d47b
fe9c860
a9085dd
f851b8d
b849eaf
b93d037
591e004
2382017
fdac7da
4da919c
b8b205f
51696a8
2c70328
9a5d14f
0924257
5de46d2
ee95fe4
736ad4e
32a6984
bb0cb32
78b8aea
d17f9a1
8281e3c
57df5c7
bb1dd25
f9a9e10
84a9ed4
67d8fb2
a09f97f
68bc8ae
46b84ef
3bca14e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,5 @@ | ||
language: python | ||
sudo: false | ||
env: | ||
global: | ||
- PYTHON_VERSION=2.7 | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -85,10 +85,11 @@ | |
from .base import QiitaObject | ||
from .logger import LogEntry | ||
from .sql_connection import SQLConnectionHandler | ||
from .exceptions import QiitaDBError, QiitaDBUnknownIDError | ||
from .exceptions import QiitaDBError, QiitaDBUnknownIDError, QiitaDBStatusError | ||
from .util import (exists_dynamic_table, insert_filepaths, convert_to_id, | ||
convert_from_id, purge_filepaths, get_filepath_id, | ||
get_mountpoint, move_filepaths_to_upload_folder) | ||
get_mountpoint, move_filepaths_to_upload_folder, | ||
infer_status) | ||
|
||
|
||
class BaseData(QiitaObject): | ||
|
@@ -615,6 +616,58 @@ def remove_filepath(self, fp): | |
# Delete the files, if they are not used anywhere | ||
purge_filepaths(conn_handler) | ||
|
||
def status(self, study): | ||
"""The status of the raw data within the given study | ||
|
||
Parameters | ||
---------- | ||
study : Study | ||
The study that is looking to the raw data status | ||
|
||
Returns | ||
------- | ||
str | ||
The status of the raw data | ||
|
||
Raises | ||
------ | ||
QiitaDBStatusError | ||
If the raw data does not belong to the passed study | ||
|
||
Notes | ||
----- | ||
Given that a raw data can be shared by multiple studies, we need to | ||
know under which context (study) we want to check the raw data status. | ||
The rationale is that a raw data object can contain data from multiple | ||
studies, so the raw data can have multiple status at the same time. | ||
We then check the processed data generated to infer the status of the | ||
raw data. | ||
""" | ||
if self._id not in study.raw_data(): | ||
raise QiitaDBStatusError( | ||
"The study %s does not have access to the raw data %s" | ||
% (study.id, self.id)) | ||
|
||
conn_handler = SQLConnectionHandler() | ||
sql = """SELECT processed_data_status | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not blocking, but these would be cleaner/easier to read if you used There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks! I changed all instances that I've added in this PR. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nice, that's some nice shorthand to know! Thanks @squirrelo |
||
FROM qiita.processed_data_status pds | ||
JOIN qiita.processed_data pd | ||
USING (processed_data_status_id) | ||
JOIN qiita.preprocessed_processed_data ppd_pd | ||
USING (processed_data_id) | ||
JOIN qiita.prep_template_preprocessed_data pt_ppd | ||
USING (preprocessed_data_id) | ||
JOIN qiita.prep_template pt | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why the prep template joins in here? If we just need to get up to the processed data, using raw_data, study_raw_data, and study_preprocessed_data would be easier and faster than prep template. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Note that I have to check the specific processed data that has been generated from this raw data, not any processed data. That path is only available through the prep template. |
||
USING (prep_template_id) | ||
JOIN qiita.raw_data rd | ||
USING (raw_data_id) | ||
JOIN qiita.study_processed_data spd | ||
USING (processed_data_id) | ||
WHERE pt.raw_data_id=%s AND spd.study_id=%s""" | ||
pd_statuses = conn_handler.execute_fetchall(sql, (self._id, study.id)) | ||
|
||
return infer_status(pd_statuses) | ||
|
||
|
||
class PreprocessedData(BaseData): | ||
r"""Object for dealing with preprocessed data | ||
|
@@ -995,6 +1048,34 @@ def processing_status(self, state): | |
"UPDATE qiita.{0} SET processing_status=%s WHERE " | ||
"preprocessed_data_id=%s".format(self._table), (state, self.id)) | ||
|
||
@property | ||
def status(self): | ||
"""The status of the preprocessed data | ||
|
||
Returns | ||
------- | ||
str | ||
The status of the preprocessed_data | ||
|
||
Notes | ||
----- | ||
The status of a preprocessed data is inferred by the status of the | ||
processed data generated from this preprocessed data. If no processed | ||
data has been generated with this preprocessed data; then the status | ||
is 'sandbox'. | ||
""" | ||
conn_handler = SQLConnectionHandler() | ||
sql = """SELECT processed_data_status | ||
FROM qiita.processed_data_status pds | ||
JOIN qiita.processed_data pd | ||
USING (processed_data_status_id) | ||
JOIN qiita.preprocessed_processed_data ppd_pd | ||
USING (processed_data_id) | ||
WHERE ppd_pd.preprocessed_data_id=%s""" | ||
pd_statuses = conn_handler.execute_fetchall(sql, (self._id,)) | ||
|
||
return infer_status(pd_statuses) | ||
|
||
|
||
class ProcessedData(BaseData): | ||
r"""Object for dealing with processed data | ||
|
@@ -1020,6 +1101,61 @@ class ProcessedData(BaseData): | |
_study_processed_table = "study_processed_data" | ||
_preprocessed_processed_table = "preprocessed_processed_data" | ||
|
||
@classmethod | ||
def get_by_status(cls, status): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Since we have this for processed data, we should have it for preprocessed and raw data as well to keep things consistent. Anything that has a status really should have this classmethod. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I've been thinking about this... I can do that, although that functionality has not been required for any part of the code right now; so I figured that it will be easier to open an issue for that and do not block more DB development with this PR: #1009 |
||
"""Returns id for all ProcessedData with given status | ||
|
||
Parameters | ||
---------- | ||
status : str | ||
Status to search for | ||
|
||
Returns | ||
------- | ||
list of int | ||
All the processed data ids that match the given status | ||
""" | ||
conn_handler = SQLConnectionHandler() | ||
sql = """SELECT processed_data_id FROM qiita.processed_data pd | ||
JOIN qiita.processed_data_status pds | ||
USING (processed_data_status_id) | ||
WHERE pds.processed_data_status=%s""" | ||
result = conn_handler.execute_fetchall(sql, (status,)) | ||
if result: | ||
pds = set(x[0] for x in result) | ||
else: | ||
pds = set() | ||
|
||
return pds | ||
|
||
@classmethod | ||
def get_by_status_grouped_by_study(cls, status): | ||
"""Returns id for all ProcessedData with given status grouped by study | ||
|
||
Parameters | ||
---------- | ||
status : str | ||
Status to search for | ||
|
||
Returns | ||
------- | ||
dict of list of int | ||
A dictionary keyed by study id in which the values are the | ||
processed data ids that belong to that study and match the given | ||
status | ||
""" | ||
conn_handler = SQLConnectionHandler() | ||
sql = """SELECT spd.study_id, | ||
array_agg(pd.processed_data_id ORDER BY pd.processed_data_id) | ||
FROM qiita.processed_data pd | ||
JOIN qiita.processed_data_status pds | ||
USING (processed_data_status_id) | ||
JOIN qiita.study_processed_data spd | ||
USING (processed_data_id) | ||
WHERE pds.processed_data_status = %s | ||
GROUP BY spd.study_id;""" | ||
return dict(conn_handler.execute_fetchall(sql, (status,))) | ||
|
||
@classmethod | ||
def create(cls, processed_params_table, processed_params_id, filepaths, | ||
preprocessed_data=None, study=None, processed_date=None, | ||
|
@@ -1173,3 +1309,40 @@ def processed_date(self): | |
return conn_handler.execute_fetchone( | ||
"SELECT processed_date FROM qiita.{0} WHERE " | ||
"processed_data_id=%s".format(self._table), (self.id,))[0] | ||
|
||
@property | ||
def status(self): | ||
conn_handler = SQLConnectionHandler() | ||
sql = """SELECT pds.processed_data_status | ||
FROM qiita.processed_data_status pds | ||
JOIN qiita.processed_data pd | ||
USING (processed_data_status_id) | ||
WHERE pd.processed_data_id=%s""" | ||
return conn_handler.execute_fetchone(sql, (self._id,))[0] | ||
|
||
@status.setter | ||
def status(self, status): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There is no sanity check for the status, so when I tested it with status 'chicken' it errors out. Better to use There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. And by errors out I mean it says it would violate the NOT NULL constraint of the column. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good catch. Fixed, note that this is the only status setter implemented on this PR (as all the other status are inferred from the processed data status) |
||
"""Set the status value | ||
|
||
Parameters | ||
---------- | ||
status : str | ||
The new status | ||
|
||
Raises | ||
------ | ||
QiitaDBStatusError | ||
If the processed data status is public | ||
""" | ||
if self.status == 'public': | ||
raise QiitaDBStatusError( | ||
"Illegal operation on public processed data") | ||
|
||
conn_handler = SQLConnectionHandler() | ||
|
||
status_id = convert_to_id(status, 'processed_data_status', | ||
conn_handler=conn_handler) | ||
|
||
sql = """UPDATE qiita.{0} SET processed_data_status_id = %s | ||
WHERE processed_data_id=%s""".format(self._table) | ||
conn_handler.execute(sql, (status_id, self._id)) |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -81,9 +81,8 @@ def get_accessible_filepath_ids(user): | |
return set(f[0] for f in fpids) | ||
|
||
# First, the studies | ||
# There are public, private, and shared studies | ||
study_ids = Study.get_by_status('public') | user.user_studies | \ | ||
user.shared_studies | ||
# There are private and shared studies | ||
study_ids = user.user_studies | user.shared_studies | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why remove the public from here? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nevermind, see it below. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 😄 |
||
|
||
filepath_ids = set() | ||
for study_id in study_ids: | ||
|
@@ -107,7 +106,6 @@ def get_accessible_filepath_ids(user): | |
for rdid in study.raw_data(): | ||
for pt_id in RawData(rdid).prep_templates: | ||
# related to https://github.com/biocore/qiita/issues/596 | ||
# and https://github.com/biocore/qiita/issues/554 | ||
if PrepTemplate.exists(pt_id): | ||
for _id, _ in PrepTemplate(pt_id).get_filepaths(): | ||
prep_fp_ids.append(_id) | ||
|
@@ -118,6 +116,40 @@ def get_accessible_filepath_ids(user): | |
in SampleTemplate(study_id).get_filepaths()] | ||
filepath_ids.update(sample_fp_ids) | ||
|
||
# Next, the public processed data | ||
processed_data_ids = ProcessedData.get_by_status('public') | ||
for pd_id in processed_data_ids: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This seems amazingly expensive, especially if this is going to be used to check file access and needs to happen every download. Can you create a PrepTemplate(x).associated_files or some other way of getting these out with far less database calls. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm just following the pattern on this function. I already noticed that this function is quite bad and needs some refactoring, but I think it needs to be done outside of this PR. See #1010 |
||
processed_data = ProcessedData(pd_id) | ||
|
||
# Add the filepaths of the processed data | ||
pd_fps = (fpid for fpid, _, _ in processed_data.get_filepaths()) | ||
filepath_ids.update(pd_fps) | ||
|
||
# Each processed data has a preprocessed data | ||
ppd = PreprocessedData(processed_data.preprocessed_data) | ||
ppd_fps = (fpid for fpid, _, _ in ppd.get_filepaths()) | ||
filepath_ids.update(ppd_fps) | ||
|
||
# Each preprocessed data has a prep template | ||
pt_id = ppd.prep_template | ||
# related to https://github.com/biocore/qiita/issues/596 | ||
if PrepTemplate.exists(pt_id): | ||
pt = PrepTemplate(pt_id) | ||
pt_fps = (fpid for fpid, _ in pt.get_filepaths()) | ||
filepath_ids.update(pt_fps) | ||
|
||
# Each prep template has a raw data | ||
rd = RawData(pt.raw_data) | ||
rd_fps = (fpid for fpid, _, _ in rd.get_filepaths()) | ||
filepath_ids.update(rd_fps) | ||
|
||
# And each processed data has a study, which has a sample template | ||
st_id = processed_data.study | ||
if SampleTemplate.exists(st_id): | ||
sample_fp_ids = (_id for _id, _ | ||
in SampleTemplate(st_id).get_filepaths()) | ||
filepath_ids.update(sample_fp_ids) | ||
|
||
# Next, analyses | ||
# Same as before, there are public, private, and shared | ||
analysis_ids = Analysis.get_by_status('public') | user.private_analyses | \ | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You might want to remove this section as there are not specific exceptions raised in this method.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I actually forgot to add the Error raised 😲