Merge branch 'master'

ARGA-Genomes · Jan 24, 2025 · aad59d5 · aad59d5
2 parents 93454e2 + c568c26
commit aad59d5
Show file tree

Hide file tree

Showing 64 changed files with 1,292 additions and 214 deletions.
diff --git a/dataSources/42bp/genomeArk/config.json b/dataSources/42bp/genomeArk/config.json
@@ -12,5 +12,11 @@
     },
     "conversion": {
         "mapID": 84855374
+    },
+    "update": {
+        "type": "weekly",
+        "day": "sunday",
+        "time": 9,
+        "repeat": 2
     }
 }
diff --git a/dataSources/ala/avh/config.json b/dataSources/ala/avh/config.json
@@ -11,5 +11,11 @@
     },
     "conversion": {
         "mapID": 404635334
+    },
+    "update": {
+        "type": "weekly",
+        "day": "sunday",
+        "time": 9,
+        "repeat": 2
     }
 }
diff --git a/dataSources/ala/lists/config.json b/dataSources/ala/lists/config.json
@@ -0,0 +1,12 @@
+{
+    "retrieveType": "script",
+    "download": {
+        "path": "./processing.py",
+        "function": "collect",
+        "args": [
+            "{OUTPATH}"
+        ],
+        "output": "lists.csv"
+    },
+    "conversion": {}
+}
diff --git a/dataSources/ala/lists/processing.py b/dataSources/ala/lists/processing.py
@@ -0,0 +1,58 @@
+from pathlib import Path
+import pandas as pd
+import requests
+
+def collect(outputPath: Path) -> None:
+    """Download the ALA species lists tagged "arga" and write their items to CSV.
+
+    List metadata is cached in ``metadata.csv`` next to ``outputPath``; when that
+    file already exists it is read instead of being requested again. The items
+    of every list are then fetched page by page, joined to the list metadata on
+    ``speciesListID`` and written to ``outputPath``.
+
+    Args:
+        outputPath: Destination CSV file for the list items joined to metadata.
+
+    Raises:
+        requests.HTTPError: If the web service answers with an error status.
+    """
+    baseURL = "https://lists-ws.test.ala.org.au/"
+    recordsPerPage = 100
+    listsMetadata = outputPath.parent / "metadata.csv"
+
+    # The session is only needed while downloading, so release its connections
+    # as soon as the last page has been fetched.
+    with requests.Session() as session:
+
+        def getURL(endpoint: str, params: dict, pageSize: int, page: int = 1):
+            """Fetch one page of ``endpoint`` and return the decoded JSON body."""
+            fields = {**params, "page": page, "pageSize": pageSize}
+            # Let requests build the query string so values are URL-encoded.
+            response = session.get(f"{baseURL}{endpoint}", params=fields)
+            # Fail loudly: a non-empty error payload would otherwise be appended
+            # as data and keep the item paging loop below running forever.
+            response.raise_for_status()
+            return response.json()
+
+        if listsMetadata.exists():
+            df = pd.read_csv(listsMetadata)
+        else:
+            metadataEndpoint = "speciesList/"
+            query = {"tag": "arga"}
+
+            data = getURL(metadataEndpoint, query, recordsPerPage)
+            records = list(data["lists"])
+            # Ceiling division: pages needed to cover "listCount" entries.
+            totalPages = -(-data["listCount"] // recordsPerPage)
+
+            # Page 1 was fetched above, continue from page 2.
+            for page in range(2, totalPages + 1):
+                data = getURL(metadataEndpoint, query, recordsPerPage, page)
+                records.extend(data["lists"])
+
+            df = pd.DataFrame.from_records(records)
+            df = df.drop(["description"], axis=1)
+            df.to_csv(listsMetadata, index=False)
+
+        records = []
+        for listID in df["id"]:
+            page = 1
+            while True:
+                print(f"Getting page #{page} for id {listID}", end="\r")
+                data = getURL(f"speciesListItems/{listID}", {}, recordsPerPage, page)
+                if not data:  # An empty page marks the end of this list
+                    break
+
+                records.extend(data)
+                page += 1
+
+            print()
+
+    items = pd.DataFrame.from_records(records)
+    df = df.rename(columns={"id": "speciesListID", "version": "speciesListVersion"})
+    df = df.merge(items, "outer", on="speciesListID")
+    # Write the merged frame; writing only the raw items would discard the join.
+    df.to_csv(outputPath, index=False)
diff --git a/dataSources/ala/profiles/config.json b/dataSources/ala/profiles/config.json
@@ -1,12 +1,12 @@
 {
     "retrieveType": "script",
-    "subsections": [
-        "kamilaroi",
-        "noongar",
-        "southeastarnhemland",
-        "mangrovewatch",
-        "weeds-australia"
-    ],
+    "subsections": {
+        "kamilaroi": {},
+        "noongar": {},
+        "southeastarnhemland": {},
+        "mangrovewatch": {},
+        "weeds-australia": {}
+    },
     "download": {
         "path": "sourceProcessing/ala.py",
         "function": "collect",
@@ -16,5 +16,11 @@
             "./token.json"
         ],
         "output": "{SUBSECTION}.csv"
+    },
+    "update": {
+        "type": "weekly",
+        "day": "sunday",
+        "time": 9,
+        "repeat": 2
     }
 }
diff --git a/dataSources/algaeBase/api/config.json b/dataSources/algaeBase/api/config.json
@@ -9,5 +9,11 @@
             "./apiKey.txt"
         ],
         "output": "algaeBase.csv"
+    },
+    "update": {
+        "type": "weekly",
+        "day": "sunday",
+        "time": 9,
+        "repeat": 2
     }
 }
diff --git a/dataSources/anemone/db/config.json b/dataSources/anemone/db/config.json
@@ -30,5 +30,11 @@
                 "function": "dwcAugment"
             }
         ]
+    },
+    "update": {
+        "type": "weekly",
+        "day": "sunday",
+        "time": 9,
+        "repeat": 2
     }
 }
diff --git a/dataSources/bold/austsv/config.json b/dataSources/bold/austsv/config.json
@@ -15,5 +15,11 @@
     },
     "conversion": {
         "mapID": 78385490
+    },
+    "update": {
+        "type": "weekly",
+        "day": "sunday",
+        "time": 9,
+        "repeat": 2
     }
 }
diff --git a/dataSources/bold/ausxml/config.json b/dataSources/bold/ausxml/config.json
@@ -33,5 +33,11 @@
     },
     "conversion": {
         "mapID": 984983691
+    },
+    "update": {
+        "type": "weekly",
+        "day": "sunday",
+        "time": 9,
+        "repeat": 2
     }
 }
diff --git a/dataSources/bold/datapackage/config.json b/dataSources/bold/datapackage/config.json
@@ -32,5 +32,11 @@
     },
     "conversion": {
         "mapID": 1154592624
+    },
+    "update": {
+        "type": "weekly",
+        "day": "sunday",
+        "time": 9,
+        "repeat": 2
     }
 }
diff --git a/dataSources/bpa/portal/config.json b/dataSources/bpa/portal/config.json
@@ -12,5 +12,11 @@
     },
     "conversion": {
         "mapID": 1982878906
+    },
+    "update": {
+        "type": "weekly",
+        "day": "sunday",
+        "time": 9,
+        "repeat": 2
     }
 }
diff --git a/dataSources/bvbrc/db/config.json b/dataSources/bvbrc/db/config.json
@@ -14,5 +14,11 @@
     },
     "conversion": {
         "mapID": 685936034
+    },
+    "update": {
+        "type": "weekly",
+        "day": "sunday",
+        "time": 9,
+        "repeat": 2
     }
 }
diff --git a/dataSources/col/db/config.json b/dataSources/col/db/config.json
@@ -0,0 +1,41 @@
+{
+    "retrieveType": "url",
+    "datasetID": "ARGA:TL:0001018",
+    "download": {
+        "files": [
+            {
+                "url": "https://api.checklistbank.org/dataset/304708/export.zip?extended=true&format=DwCA",
+                "name": "catalogueOfLife.zip"
+            }
+        ]
+    },
+    "processing": {
+        "final": [
+            {
+                "path": ".../tools/zipping.py",
+                "function": "extract",
+                "args": [
+                    "{INPATH}",
+                    "{OUTDIR}"
+                ],
+                "output": "{INSTEM}"
+            },
+            {
+                "path": "./processing.py",
+                "function": "process",
+                "args": [
+                    "{INPATH}",
+                    "{OUTPATH}"
+                ],
+                "output": "col.csv"
+            }
+        ]
+    },
+    "conversion": {},
+    "update": {
+        "type": "weekly",
+        "day": "sunday",
+        "time": 9,
+        "repeat": 2
+    }
+}
diff --git a/dataSources/col/db/processing.py b/dataSources/col/db/processing.py
@@ -0,0 +1,29 @@
+from pathlib import Path
+import pandas as pd
+
+def process(folderPath: Path, outputPath: Path) -> None:
+    """Combine the extracted taxon, species profile and vernacular name tables.
+
+    Reads ``Taxon.tsv``, ``SpeciesProfile.tsv`` and ``VernacularName.tsv`` from
+    ``folderPath``, left-joins the species profile and the vernacular names to
+    the taxon table on ``dwc:taxonID`` and writes the result to ``outputPath``
+    as CSV. Vernacular names become one column per ``dcterms:language`` value,
+    each cell holding the list of names recorded for that taxon.
+
+    Args:
+        folderPath: Folder containing the three tab-separated input files.
+        outputPath: Destination CSV file.
+    """
+
+    def readCSV(fileName: str) -> pd.DataFrame:
+        """Read a tab-separated file from ``folderPath``, skipping bad lines."""
+        return pd.read_csv(folderPath / fileName, sep="\t", on_bad_lines="skip", low_memory=False)
+
+    df = readCSV("Taxon.tsv")
+
+    speciesProfile = readCSV("SpeciesProfile.tsv")
+    df = df.merge(speciesProfile, "left", "dwc:taxonID")
+
+    vernacularNames = readCSV("VernacularName.tsv")
+
+    # Group names as {taxonID: {language: [names, ...]}}. Walking the three
+    # columns in lockstep avoids building a Series per row as iterrows() does.
+    records = {}
+    rows = zip(
+        vernacularNames["dwc:taxonID"],
+        vernacularNames["dcterms:language"],
+        vernacularNames["dwc:vernacularName"],
+    )
+    for taxID, language, name in rows:
+        records.setdefault(taxID, {}).setdefault(language, []).append(name)
+
+    # One row per taxon ID (as index), one column per language.
+    vernacular = pd.DataFrame.from_dict(records, orient="index")
+    df = df.merge(vernacular, "left", left_on="dwc:taxonID", right_on=vernacular.index)
+    df.to_csv(outputPath, index=False)
diff --git a/dataSources/csiro/api/config.json b/dataSources/csiro/api/config.json
@@ -19,5 +19,11 @@
     },
     "conversion": {
         "mapID": 215504073
+    },
+    "update": {
+        "type": "weekly",
+        "day": "sunday",
+        "time": 9,
+        "repeat": 2
     }
 }
diff --git a/dataSources/csiro/dap/config.json b/dataSources/csiro/dap/config.json
@@ -11,5 +11,11 @@
     },
     "conversion": {
         "mapID": 16336602
+    },
+    "update": {
+        "type": "weekly",
+        "day": "sunday",
+        "time": 9,
+        "repeat": 2
     }
 }
diff --git a/dataSources/dnazoo/db/config.json b/dataSources/dnazoo/db/config.json
@@ -11,5 +11,11 @@
     },
     "conversion": {
         "mapID": 570069681
+    },
+    "update": {
+        "type": "weekly",
+        "day": "sunday",
+        "time": 9,
+        "repeat": 2
     }
 }
diff --git a/dataSources/ena/assembly/config.json b/dataSources/ena/assembly/config.json
@@ -14,5 +14,11 @@
     },
     "conversion": {
         "mapID": 1058330275
+    },
+    "update": {
+        "type": "weekly",
+        "day": "sunday",
+        "time": 9,
+        "repeat": 2
     }
 }
diff --git a/dataSources/ena/genome/config.json b/dataSources/ena/genome/config.json
@@ -33,5 +33,11 @@
     },
     "conversion": {
         "mapID": 1058330275
+    },
+    "update": {
+        "type": "weekly",
+        "day": "sunday",
+        "time": 9,
+        "repeat": 2
     }
 }
diff --git a/dataSources/ena/taxonomy/config.json b/dataSources/ena/taxonomy/config.json
@@ -37,5 +37,11 @@
     },
     "conversion": {
         "mapID": 1058330275
+    },
+    "update": {
+        "type": "weekly",
+        "day": "sunday",
+        "time": 9,
+        "repeat": 2
     }
 }
diff --git a/dataSources/ena/variant/config.json b/dataSources/ena/variant/config.json
@@ -24,5 +24,11 @@
     },
     "conversion": {
         "mapID": 1058330275
+    },
+    "update": {
+        "type": "weekly",
+        "day": "sunday",
+        "time": 9,
+        "repeat": 2
     }
 }
diff --git a/dataSources/esa178/db/config.json b/dataSources/esa178/db/config.json
@@ -30,5 +30,11 @@
                 }
             }
         ]
+    },
+    "update": {
+        "type": "weekly",
+        "day": "sunday",
+        "time": 9,
+        "repeat": 2
     }
 }
diff --git a/dataSources/goat/db/config.json b/dataSources/goat/db/config.json
@@ -21,5 +21,11 @@
                 "output": "cleanedgoat.csv"
             }
         ]
+    },
+    "update": {
+        "type": "weekly",
+        "day": "sunday",
+        "time": 9,
+        "repeat": 2
     }
 }