# 3_csv_detective_analysis.py
#
# NOTE: The repository this file comes from was archived by its owner on
# Mar 11, 2024 and is now read-only.
import dataset as dataset_lib
import click
import datetime
from os import listdir
import requests
import json
import subprocess
import urllib.request
import glob
import os
# Shared handle to the SQLite database file "orchestrator.db" (the URL uses a
# relative path). save_tmp_resource() reads its "checks" table through it.
db = dataset_lib.connect("sqlite:///orchestrator.db")
@click.group()
def cli():
    """Command group for the csv-detective analysis step."""
def find_json_filenames(path, suffix=".json"):
    """Return the names of the entries in *path* whose name ends with *suffix*.

    Args:
        path: Directory to list.
        suffix: Filename ending to keep; defaults to ``".json"``.

    Returns:
        A list of bare entry names (not joined with *path*), sorted
        alphabetically so callers process them in a deterministic order.

    Raises:
        OSError: Propagated from ``os.listdir`` when *path* cannot be listed
            (for example, because it does not exist).
    """
    # listdir() yields entries in arbitrary order; sort for reproducibility.
    return sorted(name for name in listdir(path) if name.endswith(suffix))
def save_tmp_resource(jsonfilepath):
    """Download the CSV resource referenced by one webhook JSON file.

    The file must contain ``check.linkId``. That id is used to query the
    service at ``http://localhost:5010/`` for download metadata and to look up
    the matching row (``resource_id``, ``dataset_id``) in the ``checks``
    table. When the first download entry has type ``csv`` it is saved as
    ``/tmp/dataworkflow/<dataset_id>---<resource_id>.csv``; any other type is
    ignored.

    The function is best-effort: every failure (unreadable or malformed file,
    HTTP error, missing database row, failed download) is reported on stdout
    and swallowed, so one bad webhook file does not abort the whole batch.

    Args:
        jsonfilepath: Path of the webhook JSON file to process.

    Returns:
        None.
    """
    target_dir = '/tmp/dataworkflow/'
    try:
        # Close the webhook file before doing any network or database work.
        with open(jsonfilepath, encoding="utf-8") as jsonfile:
            webhookdata = json.load(jsonfile)
        linkId = webhookdata['check']['linkId']
        print(linkId)

        # A timeout keeps one unresponsive request from hanging the batch.
        r = requests.get("http://localhost:5010/" + linkId, timeout=30)
        download = r.json()['downloads'][0]
        resource_ext = download['type']
        resource_url = download['url']

        existing = db["checks"].find_one(check_id=linkId)
        if existing is None:
            # Raise a clear error instead of an opaque TypeError on
            # subscripting None.
            raise LookupError(
                "no row in 'checks' for check_id {!r}".format(linkId)
            )
        resource_id = existing['resource_id']
        dataset_id = existing['dataset_id']
        print(resource_id + "--" + dataset_id)

        if resource_ext == 'csv':
            # urlretrieve() does not create missing parent directories.
            os.makedirs(target_dir, exist_ok=True)
            target = os.path.join(
                target_dir,
                dataset_id + '---' + resource_id + '.' + resource_ext,
            )
            urllib.request.urlretrieve(resource_url, target)
    except Exception as exc:
        # Per-file boundary: report the cause and carry on with the next file.
        # Unlike a bare ``except:``, this lets KeyboardInterrupt/SystemExit
        # propagate.
        print("Error in download for {}: {!r}".format(jsonfilepath, exc))
@cli.command()
def run():
    """Download today's CSV resources and launch the csv-detective analysis.

    Empties /tmp/dataworkflow, processes every JSON file found in
    static/<today>/ with save_tmp_resource(), then starts analyze_csv_cli.py
    from the csv-detective-ml directory on the downloaded files.
    """
    workdir = '/tmp/dataworkflow'

    # Start from an empty working directory, creating it on first use.
    os.makedirs(workdir, exist_ok=True)
    for f in glob.glob(os.path.join(workdir, '*')):
        # os.remove() raises on real directories, so leave those in place;
        # symlinks are removed whatever they point to.
        if os.path.islink(f) or not os.path.isdir(f):
            os.remove(f)

    # Local date as YYYY-MM-DD; it names both the webhook folder and the run.
    today = datetime.date.today().isoformat()

    # Search today's folder for every JSON file.
    webhook_dir = "static/" + today + "/"
    for jsonfile in find_json_filenames(webhook_dir):
        save_tmp_resource(webhook_dir + jsonfile)

    # Argument list + cwd instead of a "cd ... && ..." shell string: no shell
    # is involved, and a missing csv-detective-ml directory raises here
    # instead of failing unnoticed inside the shell.
    # NOTE(review): as before, the process is started but not awaited —
    # confirm that fire-and-forget is intended.
    subprocess.Popen(
        [
            "python",
            "analyze_csv_cli.py",
            workdir,
            "/srv/datamanufactory/data-workflow/csv-detective-results/",
            today,
            "--rb_ml_analysis=both",
        ],
        cwd="csv-detective-ml",
    )
# Run the click command group only when executed as a script.
if __name__ == "__main__":
    cli()