Skip to content

Commit

Permalink
Merge branch 'master'
Browse files Browse the repository at this point in the history
  • Loading branch information
gsterjov committed Jan 24, 2025
2 parents 93454e2 + c568c26 commit aad59d5
Show file tree
Hide file tree
Showing 64 changed files with 1,292 additions and 214 deletions.
6 changes: 6 additions & 0 deletions dataSources/42bp/genomeArk/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,5 +12,11 @@
},
"conversion": {
"mapID": 84855374
},
"update": {
"type": "weekly",
"day": "sunday",
"time": 9,
"repeat": 2
}
}
6 changes: 6 additions & 0 deletions dataSources/ala/avh/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,11 @@
},
"conversion": {
"mapID": 404635334
},
"update": {
"type": "weekly",
"day": "sunday",
"time": 9,
"repeat": 2
}
}
12 changes: 12 additions & 0 deletions dataSources/ala/lists/config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
{
"retrieveType": "script",
"download": {
"path": "./processing.py",
"function": "collect",
"args": [
"{OUTPATH}"
],
"output": "lists.csv"
},
"conversion": {}
}
58 changes: 58 additions & 0 deletions dataSources/ala/lists/processing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
from pathlib import Path
import pandas as pd
import requests

def collect(outputPath: Path) -> None:
    """Download the species lists tagged "arga" and their items, and save them as CSV.

    The list metadata is fetched page by page from the ``speciesList/``
    endpoint and cached as ``metadata.csv`` next to ``outputPath``; when that
    file already exists it is read back instead of being downloaded again.
    The items of every list are then fetched from ``speciesListItems/{id}``
    until an empty page is returned, joined to the list metadata on
    ``speciesListID`` and written to ``outputPath``.

    Args:
        outputPath: Destination CSV file. Its parent folder also holds the
            cached ``metadata.csv``.

    Raises:
        requests.HTTPError: If any request returns an HTTP error status.
    """
    baseURL = "https://lists-ws.test.ala.org.au/"
    session = requests.Session()
    recordsPerPage = 100

    def getURL(endpoint: str, params: dict, pageSize: int, page: int = 1) -> dict:
        """Fetch one page of ``endpoint`` and return the decoded JSON body.

        NOTE(review): the items endpoint is consumed as a list by the caller,
        so the ``dict`` return annotation only holds for the metadata endpoint.
        """
        fields = dict(params)
        fields["page"] = page
        fields["pageSize"] = pageSize

        # Let requests assemble the query string so values are URL-encoded.
        response = session.get(f"{baseURL}{endpoint}", params=fields)
        # Stop on HTTP errors: a non-empty JSON error body is truthy and would
        # otherwise be appended to the records while the paging loop keeps going.
        response.raise_for_status()
        return response.json()

    listsMetadata = outputPath.parent / "metadata.csv"
    if not listsMetadata.exists():
        metadataEndpoint = "speciesList/"
        query = {"tag": "arga"}

        # The first page also reports the total number of lists.
        data = getURL(metadataEndpoint, query, recordsPerPage)
        listRecords = list(data["lists"])

        # Ceiling division without floats: total number of pages to request.
        totalPages = -(-data["listCount"] // recordsPerPage)
        for page in range(2, totalPages + 1):
            data = getURL(metadataEndpoint, query, recordsPerPage, page)
            listRecords.extend(data["lists"])

        df = pd.DataFrame.from_records(listRecords)
        df = df.drop(["description"], axis=1)
        df.to_csv(listsMetadata, index=False)
    else:
        df = pd.read_csv(listsMetadata)

    itemRecords = []
    for listID in df["id"]:
        page = 1
        while True:
            print(f"Getting page #{page} for id {listID}", end="\r")
            data = getURL(f"speciesListItems/{listID}", {}, recordsPerPage, page)
            if not data:  # an empty page marks the end of this list
                break

            itemRecords.extend(data)
            page += 1

        print()

    items = pd.DataFrame.from_records(itemRecords)

    # Rename the metadata columns so they line up with / do not clash with
    # the item columns before joining.
    df = df.rename(columns={"id": "speciesListID", "version": "speciesListVersion"})

    # Without any items there is no key column to join on; keep the metadata only.
    if "speciesListID" in items.columns:
        df = df.merge(items, "outer", on="speciesListID")

    # Write the joined table; previously the merge result was discarded and
    # only the raw items were saved.
    df.to_csv(outputPath, index=False)
20 changes: 13 additions & 7 deletions dataSources/ala/profiles/config.json
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
{
"retrieveType": "script",
"subsections": [
"kamilaroi",
"noongar",
"southeastarnhemland",
"mangrovewatch",
"weeds-australia"
],
"subsections": {
"kamilaroi": {},
"noongar": {},
"southeastarnhemland": {},
"mangrovewatch": {},
"weeds-australia": {}
},
"download": {
"path": "sourceProcessing/ala.py",
"function": "collect",
Expand All @@ -16,5 +16,11 @@
"./token.json"
],
"output": "{SUBSECTION}.csv"
},
"update": {
"type": "weekly",
"day": "sunday",
"time": 9,
"repeat": 2
}
}
6 changes: 6 additions & 0 deletions dataSources/algaeBase/api/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,11 @@
"./apiKey.txt"
],
"output": "algaeBase.csv"
},
"update": {
"type": "weekly",
"day": "sunday",
"time": 9,
"repeat": 2
}
}
6 changes: 6 additions & 0 deletions dataSources/anemone/db/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -30,5 +30,11 @@
"function": "dwcAugment"
}
]
},
"update": {
"type": "weekly",
"day": "sunday",
"time": 9,
"repeat": 2
}
}
6 changes: 6 additions & 0 deletions dataSources/bold/austsv/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -15,5 +15,11 @@
},
"conversion": {
"mapID": 78385490
},
"update": {
"type": "weekly",
"day": "sunday",
"time": 9,
"repeat": 2
}
}
6 changes: 6 additions & 0 deletions dataSources/bold/ausxml/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -33,5 +33,11 @@
},
"conversion": {
"mapID": 984983691
},
"update": {
"type": "weekly",
"day": "sunday",
"time": 9,
"repeat": 2
}
}
6 changes: 6 additions & 0 deletions dataSources/bold/datapackage/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -32,5 +32,11 @@
},
"conversion": {
"mapID": 1154592624
},
"update": {
"type": "weekly",
"day": "sunday",
"time": 9,
"repeat": 2
}
}
6 changes: 6 additions & 0 deletions dataSources/bpa/portal/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,5 +12,11 @@
},
"conversion": {
"mapID": 1982878906
},
"update": {
"type": "weekly",
"day": "sunday",
"time": 9,
"repeat": 2
}
}
6 changes: 6 additions & 0 deletions dataSources/bvbrc/db/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,11 @@
},
"conversion": {
"mapID": 685936034
},
"update": {
"type": "weekly",
"day": "sunday",
"time": 9,
"repeat": 2
}
}
41 changes: 41 additions & 0 deletions dataSources/col/db/config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
{
"retrieveType": "url",
"datasetID": "ARGA:TL:0001018",
"download": {
"files": [
{
"url": "https://api.checklistbank.org/dataset/304708/export.zip?extended=true&format=DwCA",
"name": "catalogueOfLife.zip"
}
]
},
"processing": {
"final": [
{
"path": ".../tools/zipping.py",
"function": "extract",
"args": [
"{INPATH}",
"{OUTDIR}"
],
"output": "{INSTEM}"
},
{
"path": "./processing.py",
"function": "process",
"args": [
"{INPATH}",
"{OUTPATH}"
],
"output": "col.csv"
}
]
},
"conversion": {},
"update": {
"type": "weekly",
"day": "sunday",
"time": 9,
"repeat": 2
}
}
29 changes: 29 additions & 0 deletions dataSources/col/db/processing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
from pathlib import Path
import pandas as pd

def process(folderPath: Path, outputPath: Path) -> None:
    """Combine the taxon, species profile and vernacular name tables into one CSV.

    Reads ``Taxon.tsv``, ``SpeciesProfile.tsv`` and ``VernacularName.tsv`` from
    ``folderPath``. The species profile is left-joined onto the taxa by
    ``dwc:taxonID``. The vernacular names are then pivoted into one column per
    ``dcterms:language`` value, each cell holding the list of names recorded
    for that taxon in that language, and left-joined onto the taxa as well.

    Args:
        folderPath: Folder containing the three tab-separated files.
        outputPath: Destination CSV file, written without the index column.
    """

    def readCSV(fileName: str) -> pd.DataFrame:
        """Read a tab-separated file from ``folderPath``, skipping malformed lines."""
        return pd.read_csv(folderPath / fileName, sep="\t", on_bad_lines="skip", low_memory=False)

    df = readCSV("Taxon.tsv")

    speciesProfile = readCSV("SpeciesProfile.tsv")
    df = df.merge(speciesProfile, "left", "dwc:taxonID")

    vernacularNames = readCSV("VernacularName.tsv")

    # Group names as taxon ID -> language -> [names]. Walking the three
    # columns in lockstep avoids building a Series per row as iterrows() does.
    records = {}
    rows = zip(
        vernacularNames["dwc:taxonID"],
        vernacularNames["dcterms:language"],
        vernacularNames["dwc:vernacularName"],
    )
    for taxID, language, name in rows:
        records.setdefault(taxID, {}).setdefault(language, []).append(name)

    # One row per taxon ID (as the index), one column per language.
    vernacular = pd.DataFrame.from_dict(records, orient="index")
    df = df.merge(vernacular, "left", left_on="dwc:taxonID", right_on=vernacular.index)
    df.to_csv(outputPath, index=False)
6 changes: 6 additions & 0 deletions dataSources/csiro/api/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -19,5 +19,11 @@
},
"conversion": {
"mapID": 215504073
},
"update": {
"type": "weekly",
"day": "sunday",
"time": 9,
"repeat": 2
}
}
6 changes: 6 additions & 0 deletions dataSources/csiro/dap/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,11 @@
},
"conversion": {
"mapID": 16336602
},
"update": {
"type": "weekly",
"day": "sunday",
"time": 9,
"repeat": 2
}
}
6 changes: 6 additions & 0 deletions dataSources/dnazoo/db/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,11 @@
},
"conversion": {
"mapID": 570069681
},
"update": {
"type": "weekly",
"day": "sunday",
"time": 9,
"repeat": 2
}
}
6 changes: 6 additions & 0 deletions dataSources/ena/assembly/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,11 @@
},
"conversion": {
"mapID": 1058330275
},
"update": {
"type": "weekly",
"day": "sunday",
"time": 9,
"repeat": 2
}
}
6 changes: 6 additions & 0 deletions dataSources/ena/genome/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -33,5 +33,11 @@
},
"conversion": {
"mapID": 1058330275
},
"update": {
"type": "weekly",
"day": "sunday",
"time": 9,
"repeat": 2
}
}
6 changes: 6 additions & 0 deletions dataSources/ena/taxonomy/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -37,5 +37,11 @@
},
"conversion": {
"mapID": 1058330275
},
"update": {
"type": "weekly",
"day": "sunday",
"time": 9,
"repeat": 2
}
}
6 changes: 6 additions & 0 deletions dataSources/ena/variant/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -24,5 +24,11 @@
},
"conversion": {
"mapID": 1058330275
},
"update": {
"type": "weekly",
"day": "sunday",
"time": 9,
"repeat": 2
}
}
6 changes: 6 additions & 0 deletions dataSources/esa178/db/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -30,5 +30,11 @@
}
}
]
},
"update": {
"type": "weekly",
"day": "sunday",
"time": 9,
"repeat": 2
}
}
6 changes: 6 additions & 0 deletions dataSources/goat/db/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -21,5 +21,11 @@
"output": "cleanedgoat.csv"
}
]
},
"update": {
"type": "weekly",
"day": "sunday",
"time": 9,
"repeat": 2
}
}
Loading

0 comments on commit aad59d5

Please sign in to comment.