Skip to content

Commit

Permalink
repaired download
Browse files Browse the repository at this point in the history
  • Loading branch information
janekg89 committed Oct 1, 2020
1 parent d71b494 commit de0d858
Show file tree
Hide file tree
Showing 9 changed files with 180 additions and 80 deletions.
1 change: 0 additions & 1 deletion backend/pkdb_app/behaviours.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,6 @@ def study_sid(self):
def map_field(fields):
    """Return the ``*_map`` counterpart of every name in ``fields``.

    :param fields: iterable of field names (strings).
    :return: list with ``"_map"`` appended to each name, in input order.
    """
    return [f"{field}_map" for field in fields]

# Base list of value fields; the two lists below are built by extending it.
VALUE_FIELDS_SAME_SCALE = ["value", "mean", "median", "min", "max"]
# Same-scale fields plus "sd", "se" and "cv"; does not include "unit".
VALUE_FIELDS_NO_UNIT = VALUE_FIELDS_SAME_SCALE + ["sd", "se", "cv"]
# Every value field, including the "unit" field.
VALUE_FIELDS = VALUE_FIELDS_NO_UNIT + ["unit"]
Expand Down
2 changes: 1 addition & 1 deletion backend/pkdb_app/interventions/documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
# ------------------------------------
@registry.register_document
class InterventionDocument(Document):
pk = fields.IntegerField()
pk = fields.IntegerField("pk")
measurement_type = info_node("i_measurement_type")
form = info_node("i_form")
route = info_node("i_route")
Expand Down
56 changes: 34 additions & 22 deletions backend/pkdb_app/interventions/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,13 @@
"""
import itertools

from django.apps import apps
from rest_framework import serializers

from pkdb_app import utils
from pkdb_app.behaviours import VALUE_FIELDS_NO_UNIT, \
MEASUREMENTTYPE_FIELDS, map_field, EX_MEASUREMENTTYPE_FIELDS
from pkdb_app.info_nodes.models import InfoNode
from pkdb_app.info_nodes.serializers import MeasurementTypeableSerializer, EXMeasurementTypeableSerializer
from pkdb_app.info_nodes.serializers import MeasurementTypeableSerializer
from pkdb_app.subjects.serializers import EXTERN_FILE_FIELDS
from ..comments.serializers import DescriptionSerializer, CommentSerializer, DescriptionElasticSerializer, \
CommentElasticSerializer
Expand Down Expand Up @@ -297,36 +296,49 @@ class Meta:
fields = ["pk", "normed"] + INTERVENTION_FIELDS + ["study"] + MEASUREMENTTYPE_FIELDS


class InterventionElasticSerializerAnalysis(serializers.ModelSerializer):
class InterventionElasticSerializerAnalysis(serializers.Serializer):
study_sid = serializers.CharField()
study_name = serializers.CharField()
intervention_pk = serializers.IntegerField(source="pk")
substance = serializers.CharField(source="substance_name", allow_null=True)
measurement_type = serializers.CharField(source="measurement_type_name",)
raw_pk = serializers.IntegerField()
normed = serializers.BooleanField()

name = serializers.CharField()
route = serializers.CharField(source="route_name",)
application = serializers.CharField(source="application_name",)
form = serializers.CharField(source="form_name",)
application = serializers.CharField(source="application_name",)
time = serializers.FloatField()
time_end = serializers.FloatField()
time_unit = serializers.CharField()
measurement_type = serializers.CharField(source="measurement_type_name",)
choice = serializers.CharField(source="choice_name")
value = serializers.FloatField(allow_null=True)
mean = serializers.FloatField(allow_null=True)
median = serializers.FloatField(allow_null=True)
min = serializers.FloatField(allow_null=True)
max = serializers.FloatField(allow_null=True)
sd = serializers.FloatField(allow_null=True)
se = serializers.FloatField(allow_null=True)
cv = serializers.FloatField(allow_null=True)
substance = serializers.CharField(source="substance_name", )

value = serializers.FloatField()
mean = serializers.FloatField()
median = serializers.FloatField()
min = serializers.FloatField()
max = serializers.FloatField()
sd = serializers.FloatField()
se = serializers.FloatField()
cv = serializers.FloatField()
unit = serializers.CharField()

class Meta:
model = Intervention
fields = ["study_sid", "study_name", "intervention_pk", "raw_pk",
"normed"] + INTERVENTION_FIELDS + MEASUREMENTTYPE_FIELDS




"""
def to_representation(self, instance):
rep = super().to_representation(instance)
for field in VALUE_FIELDS_NO_UNIT + ["time"]:
try:
rep[field] = '{:.2e}'.format(rep[field])
except (ValueError, TypeError):
pass
return rep
rep = super().to_representation(instance)
for field in VALUE_FIELDS_NO_UNIT + ["time"]:
try:
rep[field] = '{:.2e}'.format(rep[field])
except (ValueError, TypeError):
pass
return rep
"""

41 changes: 35 additions & 6 deletions backend/pkdb_app/outputs/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,12 @@
from .models import (
Output,
OutputSet,
OutputEx,
OutputIntervention)
OutputEx)
from ..comments.serializers import DescriptionSerializer, CommentSerializer, DescriptionElasticSerializer, \
CommentElasticSerializer
from ..interventions.models import Intervention
from ..serializers import (
ExSerializer, StudySmallElasticSerializer, SidNameLabelSerializer, SidNameSerializer)
ExSerializer, StudySmallElasticSerializer, SidNameLabelSerializer)
from ..subjects.models import Group, DataFile, Individual
from ..subjects.serializers import (
EXTERN_FILE_FIELDS, GroupSmallElasticSerializer, IndividualSmallElasticSerializer)
Expand Down Expand Up @@ -315,12 +314,42 @@ def get_outputs(self, obj):
return list_of_pk("outputs", obj)


class OutputInterventionSerializer(serializers.ModelSerializer):
class OutputInterventionSerializer(serializers.Serializer):
study_sid = serializers.CharField()
study_name = serializers.CharField()
output_pk = serializers.IntegerField()
intervention_pk = serializers.IntegerField()
group_pk = serializers.IntegerField()
individual_pk = serializers.IntegerField()
normed = serializers.BooleanField()
calculated = serializers.BooleanField()

tissue = serializers.CharField()
method = serializers.CharField()
label = serializers.CharField()
output_type = serializers.CharField()

time = serializers.FloatField()
time_unit = serializers.CharField()

measurement_type =serializers.CharField()
choice = serializers.CharField()
substance =serializers.CharField()

value = serializers.FloatField()
mean = serializers.FloatField()
median = serializers.FloatField()
min = serializers.FloatField()
max = serializers.FloatField()
sd = serializers.FloatField()
se = serializers.FloatField()
cv = serializers.FloatField()
unit = serializers.CharField()


class Meta:
model = OutputIntervention
fields = ["study_sid", "study_name", "output_pk", "intervention_pk", "group_pk", "individual_pk", "normed",
"calculated"] + OUTPUT_FIELDS + MEASUREMENTTYPE_FIELDS
read_only_fields = fields


class SmallOutputSerializer(serializers.ModelSerializer):
Expand Down
1 change: 0 additions & 1 deletion backend/pkdb_app/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -762,7 +762,6 @@ def validate_dict(dic):
"detail": dic}
)


class StudySmallElasticSerializer(serializers.ModelSerializer):
class Meta:
model = Study
Expand Down
48 changes: 26 additions & 22 deletions backend/pkdb_app/studies/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@
DescriptionElasticSerializer
from ..interventions.models import DataFile, InterventionSet
from ..interventions.serializers import InterventionSetSerializer, InterventionSetElasticSmallSerializer
from ..serializers import WrongKeyValidationSerializer, SidSerializer, StudySmallElasticSerializer, SidNameLabelSerializer
from ..serializers import WrongKeyValidationSerializer, SidSerializer, StudySmallElasticSerializer, \
SidNameLabelSerializer
from ..subjects.models import GroupSet, IndividualSet
from ..subjects.serializers import GroupSetSerializer, IndividualSetSerializer, DataFileElasticSerializer, \
GroupSetElasticSmallSerializer, IndividualSetElasticSmallSerializer
Expand Down Expand Up @@ -526,7 +527,7 @@ class StudyElasticSerializer(serializers.ModelSerializer):

substances = SidNameLabelSerializer(many=True, )

files = serializers.SerializerMethodField() # DataFileElasticSerializer(many=True, )
files = serializers.SerializerMethodField()

comments = CommentElasticSerializer(many=True, )
descriptions = DescriptionElasticSerializer(many=True, )
Expand Down Expand Up @@ -594,21 +595,39 @@ def get_files(self, obj):
else:
return []

class StudyAnalysisSerializer(serializers.ModelSerializer):
class StudyAnalysisSerializer(serializers.Serializer):
sid = serializers.CharField()
name= serializers.CharField()
licence = serializers.CharField()
access = serializers.CharField()
date = serializers.DateField()

creator = serializers.SerializerMethodField()
curators = serializers.SerializerMethodField()
substances = serializers.SerializerMethodField()

reference_pmid = serializers.SerializerMethodField()
reference_title = serializers.SerializerMethodField()
creator = serializers.SerializerMethodField()
curators = serializers.SerializerMethodField()
reference_date = serializers.DateField()


class Meta:
model = Study

def get_substances(self, obj):
return [s["label"] for s in obj.substances]

def get_reference_pmid(self, obj):
return obj.reference["pmid"]

def get_reference_title(self, obj):
return obj.reference["title"]

def get_creator(self, obj):
return obj.creator["username"]

def get_curators(self, obj):
return [s["username"] for s in obj.curators]

class Meta:
fields = [
"sid",
"name",
Expand All @@ -624,18 +643,3 @@ class Meta:
]

read_only_fields = fields

def get_substances(self, obj):
return [s["label"] for s in obj.substances]

def get_reference_pmid(self, obj):
return obj.reference["pmid"]

def get_reference_title(self, obj):
return obj.reference["title"]

def get_creator(self, obj):
return obj.creator["username"]

def get_curators(self, obj):
return [s["username"] for s in obj.curators]
54 changes: 33 additions & 21 deletions backend/pkdb_app/studies/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import tempfile
import uuid
import zipfile
from collections import namedtuple
from collections import namedtuple, OrderedDict
from datetime import datetime
from io import StringIO
from typing import Dict
Expand Down Expand Up @@ -569,11 +569,19 @@ def _pks(self, view_class: DocumentViewSet, query_dict: Dict, pk_field: str="pk"
response = queryset.source([pk_field]).params(size=scan_size).scan()
return [instance[pk_field] for instance in response]

def data_by_query_dict(self, query_dict, viewset, serializer, boost):
    """Collect the documents selected by ``query_dict`` through ``viewset``.

    The viewset is instantiated with ``self.request`` and its own
    ``filter_queryset`` is applied before the ``terms`` filter built from
    ``query_dict`` is added.

    :param query_dict: keyword arguments for the ``terms`` filter
        (field name -> list of accepted values).
    :param viewset: view class providing ``get_queryset`` and
        ``filter_queryset``.
    :param serializer: serializer class; ``serializer.Meta.fields`` is used
        when ``boost`` is true, the serializer itself otherwise.
    :param boost: if true, skip the serializer and return the raw hits
        restricted to ``serializer.Meta.fields``; if false, run every hit
        through ``serializer``.
    :return: list of dicts (``boost``) or the serializer's ``.data``.
    """
    view = viewset(request=self.request)
    queryset = view.filter_queryset(view.get_queryset())
    # The terms filter is common to both paths, so apply it once up front.
    queryset = queryset.filter("terms", **query_dict)

    if boost:
        # Fast path: request only the serializer's fields and return the
        # hits as plain dicts without instantiating the serializer.
        queryset = queryset.source(serializer.Meta.fields)
        return [hit.to_dict() for hit in queryset.params(size=5000).scan()]

    # Regular path: the serializer builds the representation of every hit.
    return serializer(queryset.params(size=5000).scan(), many=True).data




class ResponseSerializer(serializers.Serializer):
Expand Down Expand Up @@ -717,15 +725,15 @@ def serialize_scatter(ids):
scatter_subsets = SubSet.objects.filter(id__in=ids).prefetch_related('data_points')
return [t.scatter_representation() for t in scatter_subsets]

Sheet = namedtuple("Sheet", ["sheet_name", "query_dict", "viewset", "serializer", "function"])
Sheet = namedtuple("Sheet", ["sheet_name", "query_dict", "viewset", "serializer", "function", "boost_performance",])
table_content = {
"studies": Sheet("Studies", {"pk": pkdata.ids["studies"]}, ElasticStudyViewSet, StudyAnalysisSerializer, None),
"groups": Sheet("Groups", {"group_pk": pkdata.ids["groups"]}, GroupCharacteristicaViewSet, GroupCharacteristicaSerializer, None),
"individuals": Sheet("Individuals", {"individual_pk": pkdata.ids["individuals"]}, IndividualCharacteristicaViewSet,IndividualCharacteristicaSerializer, None),
"interventions": Sheet("Interventions", {"pk": pkdata.ids["interventions"]} ,ElasticInterventionAnalysisViewSet, InterventionElasticSerializerAnalysis, None),
"outputs": Sheet("Outputs", {"output_pk": pkdata.ids["outputs"]}, OutputInterventionViewSet, OutputInterventionSerializer, None),
"studies": Sheet("Studies", {"pk": pkdata.ids["studies"]}, ElasticStudyViewSet, StudyAnalysisSerializer, None, False),
"groups": Sheet("Groups", {"group_pk": pkdata.ids["groups"]}, GroupCharacteristicaViewSet, GroupCharacteristicaSerializer, None, True,),
"individuals": Sheet("Individuals", {"individual_pk": pkdata.ids["individuals"]}, IndividualCharacteristicaViewSet,IndividualCharacteristicaSerializer, None, True),
"interventions": Sheet("Interventions", {"pk": pkdata.ids["interventions"]} ,ElasticInterventionAnalysisViewSet, InterventionElasticSerializerAnalysis, None, False),
"outputs": Sheet("Outputs", {"output_pk": pkdata.ids["outputs"]}, OutputInterventionViewSet, OutputInterventionSerializer,None, True),
#"timecourses": Sheet("Timecourses", {"subset_pk": pkdata.ids["timecourses"]}, None, None, serialize_timecourses),
"scatters": Sheet("Scatter", {"subset_pk": pkdata.ids["scatters"]}, None, None, serialize_scatter),
"scatters": Sheet("Scatter", {"subset_pk": pkdata.ids["scatters"]}, None, None, serialize_scatter, None),
}


Expand All @@ -739,15 +747,20 @@ def serialize_scatter(ids):
string_buffer = StringIO()
if sheet.function:
df = pd.DataFrame(sheet.function(sheet.query_dict["subset_pk"]))
df.to_csv(string_buffer)
archive.writestr(f'{key}.csv', string_buffer.getvalue())
download_times[key] = time.time() - download_time_start

else:
data = pkdata.data_by_query_dict(sheet.query_dict,sheet.viewset,sheet.serializer)
df = pd.DataFrame(data)
def sorted_tuple(v):
return sorted(tuple(v))

if key=="outputs":

data = pkdata.data_by_query_dict(sheet.query_dict,sheet.viewset,sheet.serializer, sheet.boost_performance)
df = pd.DataFrame(data)[sheet.serializer.Meta.fields]
df.to_csv(string_buffer)
archive.writestr(f'{key}.csv', string_buffer.getvalue())
download_times[key] = time.time() - download_time_start
if key == "outputs":
download_time_start_timecourse = time.time()
def sorted_tuple(v):
return sorted(tuple(v))
timecourse_df = df[df["output_type"] == Output.OutputTypes.Timecourse]
if len(timecourse_df) !=0:
timecourse_df = pd.pivot_table(data=timecourse_df,index=["output_pk"], aggfunc=sorted_tuple).apply(SubSet.to_list)
Expand All @@ -756,10 +769,9 @@ def sorted_tuple(v):
timecourse_df = pd.DataFrame([])
timecourse_df.to_csv(string_buffer)
archive.writestr(f'timecourse.csv', string_buffer.getvalue())
download_times["timecourse"] = time.time()-download_time_start_timecourse


df.to_csv(string_buffer)
archive.writestr(f'{key}.csv', string_buffer.getvalue())
download_times[key] = time.time()-download_time_start
archive.write('download_extra/README.md', 'README.md')
archive.write('download_extra/TERMS_OF_USE.md', 'TERMS_OF_USE.md')

Expand Down
Loading

0 comments on commit de0d858

Please sign in to comment.