-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgenerate-report.sh
executable file
·187 lines (133 loc) · 6.77 KB
/
generate-report.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
#!/bin/bash
#
# Script to generate a TPT GloBI report and, when run on Travis CI (TRAVIS is set), upload it to https://file.io .
#
# The report summarizes how many TPT records are indexed by GloBI.
#
# Also see https://parasitetracker.org, https://globalbioticinteractions.org, and https://github.com/globalbioticinteractions/globalbioticinteractions/issues/453 .
#
# Prerequisites: Java and https://github.com/globalbioticinteractions/elton .
#
#
# ---- report locations ----
# Every artifact of a run is written below ./output/<today>, relative to the
# directory the script is started from.
TODAY=$(date "+%Y-%m-%d")
REPORT_DIR="$PWD/output/$TODAY"
REPORT_ARCHIVE="$PWD/output/tpt-globi-report-$TODAY.zip"
mkdir -p "$REPORT_DIR"
REVIEW_SUMMARY="$REPORT_DIR/review_summary.tsv"
REVIEW_BY_COLLECTION="$REPORT_DIR/review_summary_by_collection.tsv"
REVIEW="$REPORT_DIR/review_comments.tsv.gz"
INTERACTIONS_BY_COLLECTION="$REPORT_DIR/indexed_interactions_by_collection.tsv"
INTERACTIONS_FULL="$REPORT_DIR/indexed_interactions_full.tsv.gz"
INTERACTIONS_SIMPLE="$REPORT_DIR/indexed_interactions_simple.tsv.gz"
DATASET_INFO="$REPORT_DIR/datasets_under_review.tsv"

# ---- datasets under review ----
# Optional first argument: file listing the datasets to review
# (defaults to datasets.tsv in the current directory).
DATASET_LIST_FILE=${1:-datasets.tsv}
# The list file is quoted so that a path containing whitespace or glob
# characters is read as a single file.
# HEAD is the first line of the list, TAIL is everything after it; the report
# functions use HEAD for the elton call that emits the header line and TAIL
# for the calls made with --no-header.
DATASETS_UNDER_REVIEW="$(cat "$DATASET_LIST_FILE")"
DATASETS_UNDER_REVIEW_HEAD="$(head -n1 "$DATASET_LIST_FILE")"
DATASETS_UNDER_REVIEW_TAIL="$(tail -n +2 "$DATASET_LIST_FILE")"

# ---- elton ----
ELTON_CMD="elton"
ELTON_VERSION=$($ELTON_CMD version)
# trace every command from here on
set -x
echo "using elton version $ELTON_VERSION"
# updating TPT affiliated elton datasets
#
# Runs "elton update" once for every dataset name in DATASETS_UNDER_REVIEW.
# NOTE(review): an earlier comment here said the datasets are updated "all at
# once to reduce github api requests"; the code issues one elton invocation
# per dataset.
#
# Globals:
#   DATASETS_UNDER_REVIEW (read) - dataset names separated by whitespace/newlines
#   ELTON_CMD (read)             - command used to invoke elton
# Returns:
#   0 when every update succeeded, 1 when at least one update failed.
function updateAll {
  local dataset
  local -a datasets=()
  local status=0
  # Split the list on whitespace into an array. Unlike an unquoted expansion
  # in a for loop, this does not glob-expand names containing *, ? or [.
  # read reports non-zero when it hits end of input, hence the "|| true".
  IFS=$' \t\n' read -r -d '' -a datasets <<< "$DATASETS_UNDER_REVIEW" || true
  for dataset in "${datasets[@]}"; do
    # ELTON_CMD is deliberately unquoted so that it may carry extra arguments
    if ! $ELTON_CMD update "$dataset"; then
      echo "failed to update dataset [$dataset]" >&2
      status=1
    fi
  done
  return "$status"
}
# generating review reports
#
# Runs "elton review" for the datasets under review, stores the raw review
# comments gzip-compressed in $REVIEW and derives two summaries from them.
#
# Globals read:
#   ELTON_CMD, DATASETS_UNDER_REVIEW_HEAD, DATASETS_UNDER_REVIEW_TAIL
# Files written:
#   $REVIEW, $REVIEW_BY_COLLECTION, $REVIEW_SUMMARY
function reviewDatasets {
  # The first dataset is reviewed with the header line included; the remaining
  # datasets are appended as an additional gzip member.
  # ELTON_CMD is deliberately unquoted so that it may carry extra arguments.
  echo "${DATASETS_UNDER_REVIEW_HEAD}" | xargs -L1 $ELTON_CMD review --type info,note | gzip > "$REVIEW"
  # NOTE(review): the first line of the combined output is dropped here even
  # though --no-header is passed; presumably elton review still emits a
  # leading line in that mode - confirm.
  echo "${DATASETS_UNDER_REVIEW_TAIL}" | xargs -L1 $ELTON_CMD review --no-header --type info,note | tail -n +2 | gzip >> "$REVIEW"
  echo -e "\n"

  # group review issues by collection
  echo -e "institutionCode\tcollectionId\tcollectionCode\tsourceCitation\tdistinctReviewCommentCount\ttype\tcomment" > "$REVIEW_BY_COLLECTION"
  # Skip the header line, keep columns 9, 11, 10, 14, 5 and 6, count identical
  # rows with uniq -c, turn the "<count> <row>" output of uniq into
  # tab-separated fields, move the count to the fifth column and relabel the
  # type "note" as "issue".
  # "$REVIEW" is quoted so that report paths containing whitespace work.
  gunzip < "$REVIEW" \
    | tail -n +2 \
    | awk -F '\t' '{ print $9 "\t" $11 "\t" $10 "\t" $14 "\t" $5 "\t" $6 }' \
    | sort \
    | uniq -c \
    | sort -nr \
    | sed -E $'s/[ ]*//;s/[ ]/\t/' \
    | awk -F '\t' '{ print $2 "\t" $3 "\t" $4 "\t" $5 "\t" $1 "\t" $6 "\t" $7 }' \
    | sed -E $'s/\tnote\t/\tissue\t/g' \
    | sort >> "$REVIEW_BY_COLLECTION"

  # review summary
  echo -e "distinctReviewCommentCount\ttype\tcomment" > "$REVIEW_SUMMARY"
  # Same counting as above, but over columns 5 and 6 only.
  gunzip < "$REVIEW" \
    | tail -n +2 \
    | awk -F '\t' '{ print $5 "\t" $6 }' \
    | sort \
    | uniq -c \
    | sort -nr \
    | sed -E $'s/[ ]*//;s/[ ]/\t/' \
    | sed -E $'s/\tnote\t/\tissue\t/g' \
    | sort >> "$REVIEW_SUMMARY"
}
# Exports the indexed interactions of the datasets under review and derives
# two views from that export.
#
# Globals read:
#   ELTON_CMD, DATASETS_UNDER_REVIEW_HEAD, DATASETS_UNDER_REVIEW_TAIL
# Files written:
#   $INTERACTIONS_FULL          - complete export, gzip-compressed
#   $INTERACTIONS_BY_COLLECTION - record counts per distinct combination of
#                                 columns 6, 5, 4, 49, 20 and 19
#   $INTERACTIONS_SIMPLE        - columns 6, 5, 4, 3, 8, 20 and 27 of the
#                                 export, gzip-compressed
indexInteractionData() {
  # full export: header line comes from the first dataset only, the remaining
  # datasets are appended without header as a second gzip member
  echo "$DATASETS_UNDER_REVIEW_HEAD" \
    | xargs -L1 $ELTON_CMD interactions \
    | gzip > "$INTERACTIONS_FULL"
  echo "$DATASETS_UNDER_REVIEW_TAIL" \
    | xargs -L1 $ELTON_CMD interactions --no-header \
    | gzip >> "$INTERACTIONS_FULL"

  # counts per collection: header first, then one line per distinct row with
  # the count produced by uniq -c moved into the fifth column
  echo -e "institutionCode\tcollectionId\tcollectionCode\tsourceCitation\tindexedInteractionRecordCount\tinteractionTypeName\tinteractionTypeId" > "$INTERACTIONS_BY_COLLECTION"
  gunzip < "$INTERACTIONS_FULL" \
    | tail -n +2 \
    | awk 'BEGIN { FS = OFS = "\t" } { print $6, $5, $4, $49, $20, $19 }' \
    | sort \
    | uniq -c \
    | sort -nr \
    | sed -E $'s/[ ]*//;s/[ ]/\t/' \
    | awk 'BEGIN { FS = OFS = "\t" } { print $2, $3, $4, $5, $1, $6, $7 }' \
    | sort >> "$INTERACTIONS_BY_COLLECTION"

  # simplified export: a fixed column selection of every line, header included
  gunzip < "$INTERACTIONS_FULL" \
    | awk 'BEGIN { FS = OFS = "\t" } { print $6, $5, $4, $3, $8, $20, $27 }' \
    | gzip > "$INTERACTIONS_SIMPLE"
}
# Prints the generated summaries to stdout and collects details on the
# datasets under review.
#
# Globals read:
#   REVIEW_SUMMARY, INTERACTIONS_BY_COLLECTION, REVIEW_BY_COLLECTION,
#   DATASETS_UNDER_REVIEW, ELTON_CMD, DATASET_INFO
# Globals written:
#   DATASET_REFERENCES - one " - <col2> accessed via <col3> on <col4>" line
#                        per dataset, built from columns 2-4 of the
#                        "elton datasets" output; used by the README.
# Files written:
#   $DATASET_INFO
function printReports() {
  echo -e "\n---- distinct review comments by type ----"
  cat "$REVIEW_SUMMARY"
  echo -e "\n---- indexed interaction record count by institutionCode, collectionId, collectionCode, and interaction type ----"
  cat "$INTERACTIONS_BY_COLLECTION"
  echo -e "\n---- distinct review comment count by institution, collection and review comment type ----"
  cat "$REVIEW_BY_COLLECTION"
  # ELTON_CMD is deliberately unquoted so that it may carry extra arguments
  echo "$DATASETS_UNDER_REVIEW" | xargs $ELTON_CMD datasets > "$DATASET_INFO"
  # Must stay global (no "local"): the value is consumed after this function
  # returns. "$DATASET_INFO" is quoted so report paths with whitespace work,
  # and tabs are passed to sed as literal characters via $'…' (as elsewhere in
  # this file) instead of relying on sed understanding "\t".
  DATASET_REFERENCES=$(cut -f2,3,4 "$DATASET_INFO" \
    | tail -n +2 \
    | sed -E -e $'s/\t/ accessed via /' -e 's/^/ - /' -e $'s/\t/ on /')
}
# Writes the README file of the report into $REPORT_DIR.
#
# Globals read:
#   REPORT_DIR         - directory the README is written to
#   DATASET_REFERENCES - text listed under "Datasets under review:"
#   TODAY              - text listed under "Generated on:"
#   ELTON_VERSION      - elton version mentioned in the "by:" section
generateReadme() {
  local readme_file="$REPORT_DIR/README"
  # unquoted heredoc delimiter: the variables in the text below are expanded
  cat > "$readme_file" <<EOF
GloBI Data Review Report
Datasets under review:
$DATASET_REFERENCES
Generated on:
$TODAY
by:
GloBI's Elton $ELTON_VERSION
(see https://github.com/globalbioticinteractions/elton).
Note that all files ending with .tsv are files formatted
as UTF8 encoded tab-separated values files.
https://www.iana.org/assignments/media-types/text/tab-separated-values
Included in this review archive are:
README:
This file.
review_summary.tsv:
Summary across all reviewed collections of total number of distinct review comments.
review_summary_by_collection.tsv:
Summary by reviewed collection of total number of distinct review comments.
indexed_interactions_by_collection.tsv:
Summary of number of indexed interaction records by institutionCode and collectionCode.
review_comments.tsv.gz:
All review comments by collection.
indexed_interactions_full.tsv.gz:
All indexed interactions for all reviewed collections.
indexed_interactions_simple.tsv.gz:
All indexed interactions for all reviewed collections selecting only sourceInstitutionCode, sourceCollectionCode, sourceCatalogNumber, sourceTaxonName, interactionTypeName and targetTaxonName.
datasets_under_review.tsv:
Details on the datasets under review.
elton.jar:
Program used to update datasets and generate the review reports and associated indexed interactions.
EOF
}
# Tells the reader where the report files are located.
# NOTE(review): this statement sits at top level between two function
# definitions, so it is executed while the script is still defining its
# functions, i.e. before the report steps invoked at the bottom of the file
# have run - confirm that this placement is intended.
echo -e "\nFor more information, see $REPORT_DIR"
# Bundles the report files into $REPORT_ARCHIVE and uploads the archive to
# https://file.io .
#
# Globals read:
#   REPORT_DIR     - directory holding the report files
#   REPORT_ARCHIVE - path of the zip archive to create and upload
# Returns:
#   1 when the report directory cannot be entered or the previous working
#   directory cannot be restored (nothing is uploaded in either case),
#   otherwise the exit status of curl.
function publishReview {
  local previous_dir=$PWD
  # zip is run from inside the report directory so that the listed files are
  # found by their bare names. If the directory cannot be entered, stop here
  # rather than zipping and uploading from the wrong directory.
  if ! cd "$REPORT_DIR"; then
    echo "Cannot publish report: unable to enter report directory [$REPORT_DIR]." >&2
    return 1
  fi
  zip "$REPORT_ARCHIVE" README review_summary.tsv review_summary_by_collection.tsv review_comments.tsv.gz indexed_interactions_full.tsv.gz indexed_interactions_simple.tsv.gz datasets_under_review.tsv elton.jar
  cd "$previous_dir" || return 1
  echo -e "\nDownload the full report [$REPORT_ARCHIVE] using single-use, and expiring, file.io link at:"
  curl -F "file=@$REPORT_ARCHIVE" https://file.io
}
# ---- main: run the report steps in order ----
updateAll
reviewDatasets
indexInteractionData
printReports
generateReadme

# Bundle the elton program used for this run with the report files.
# "command -v" is the shell's built-in way to resolve a command to its path.
elton_path=$(command -v "$ELTON_CMD")
if [[ -n "$elton_path" ]]; then
  cp -- "$elton_path" "$REPORT_DIR/elton.jar"
else
  echo "warning: cannot locate [$ELTON_CMD]; elton.jar is not included in the report" >&2
fi

# Count the distinct lines of the full interaction export. The export starts
# with a header line, so a count of 1 or less means that no interaction
# records were indexed.
NUMBER_OF_INTERACTIONS=$(gunzip < "$INTERACTIONS_FULL" | sort -u | wc -l)
if [[ "$NUMBER_OF_INTERACTIONS" -gt 1 ]]; then
  # only upload the report when running on Travis CI (TRAVIS set and non-empty)
  if [[ -z "${TRAVIS:-}" ]]; then
    echo "Done generating report. See [${REPORT_DIR}] for results."
  else
    echo "Detected https://travis-ci.org environment, uploading report to https://file.io"
    publishReview
  fi
else
  echo "Cannot create report because no interaction records were found. Please check log."
  echo "Please check you have the latest elton installed. You are using Elton v[$ELTON_VERSION]. See https://github.com/globalbioticinteractions/elton#install for install instructions and https://github.com/globalbioticinteractions/elton/releases/latest for latest version )."
  exit 1
fi