experimental_get_all_media_ids.py
# Try to gather all the media item IDs in an Aviary instance.
# This exists because the `resource` API only lists a maximum of 10 media IDs and the
# Web UI export doesn't contain IDs as of 2023-05-30.
# Scrapes the Web UI media item table and outputs a CSV.
# Proof-of-concept only.
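#
# Example invocation (a sketch only; the server URL and file names below are placeholders):
#   python experimental_get_all_media_ids.py \
#       --server https://example.aviaryplatform.com/ \
#       --session_cookie cookie.txt \
#       --output media_ids.csv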
import argparse
import csv
import json
import logging
import os
import sys
import traceback
from getpass import getpass
from time import sleep

import requests
from bs4 import BeautifulSoup

# Make the repository's local `aviary` package importable when running this script directly.
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from aviary import api as aviaryApi
from aviary import utilities as aviaryUtilities


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--server', required=True, help='Server URL, including the trailing slash.')
    parser.add_argument('--output', required=True, help='Location to store the CSV output file.')
    parser.add_argument('--session_cookie', required=True, help='Path to a file containing the session cookie from an active, authenticated Web UI session.')
    parser.add_argument('--wait', required=False, type=float, help='Time in seconds to wait between API calls.', default=1.0)
    parser.add_argument('--logging_level', required=False, help='Logging level.', default=logging.WARNING)
    return parser.parse_args()


def process(args, session, headers, report_csv):
    more_pages = True
    start = 0
    length = 100

    while more_pages:
        try:
            # The media item table is a DataTables server-side endpoint:
            # `order`, `start`, and `length` are required for paging -- unsure about the others.
            data = {
                # "draw": 2,
                # "columns[0][data]": 0,
                # "columns[0][name]": "",
                # "columns[0][searchable]": "true",
                # "columns[0][orderable]": "true",
                # "columns[0][search][value]": "true",
                # "columns[0][search][regex]": "true",
                "order[0][column]": 1,
                "order[0][dir]": "asc",
                "start": start,
                "length": length,
                "search[value]": "",
                "search[regex]": "false",
                "called_from": ""
            }
            logging.info(data)
            response = session.post(
                args.server + "collection_resource_files/data_table.json",
                headers=headers,
                data=data
            )
            response.raise_for_status()
            start = start + length
            print(f" {start} ", end="", flush=True)
            sleep(args.wait)
            logging.debug(response)
            response_json = json.loads(response.content)
            logging.debug(response_json)
            # Each row is a positional list of cell values from the Web UI table.
            for item in response_json['data']:
                logging.info(item)
                report_csv.writerow({
                    "media_id": item[1],
                    "collection_title": item[6],
                    "resource_id": item[7],
                    "resource_title": item[8],
                })
            # `recordsFiltered` is the total row count; stop once every row has been requested.
            if start >= response_json['recordsFiltered']:
                logging.info(f"Total: {response_json['recordsFiltered']}")
                more_pages = False
        except BaseException as e:
            more_pages = False
            print(f"{e}")
            traceback.print_exc()

    print("\n")


def build_session_header(args, input_file, session):
    # Fetch the page once with the supplied browser cookie and scrape the x-csrf-token
    # from its <meta> tag; both are sent on the subsequent POST requests.
    headers = {"cookie": input_file.read().rstrip("\n")}
    response = session.get(
        args.server + "collection_resource_files",
        headers=headers,
    )
    response.raise_for_status()
    soup = BeautifulSoup(response.content, features="lxml")
    csrf_token = soup.find('meta', {'name': 'csrf-token'})['content']
    headers = {
        "cookie": headers['cookie'],
        "x-csrf-token": csrf_token
    }
    logging.info(headers)
    return headers


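# Note: the session cookie file is expected to hold a single line containing the raw value of
# the browser's "cookie" request header (copied from the dev tools network tab); the exact
# cookie names vary by deployment, so any sample value is an assumption based on how the
# header is built above.

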
def main():
    args = parse_args()
    logging.getLogger().setLevel(args.logging_level)

    # Copy the cookie from an authenticated Aviary Web UI session (via the browser dev tools -> network)
    # and write it to a file (this will be the input file).
    # todo: replace with a Python auth script that can handle the MFA request
    with open(args.session_cookie, 'r', encoding="utf-8", newline='') as input_file:
        session = requests.Session()
        headers = build_session_header(args, input_file, session)
        with open(args.output, 'wt', encoding="utf-8", newline='') as output_file:
            # Use a list (not a set) for fieldnames so the CSV column order is stable.
            report_csv = csv.DictWriter(output_file, fieldnames=["media_id", "collection_title", "resource_id", "resource_title"])
            report_csv.writeheader()
            process(args, session, headers, report_csv)


if __name__ == "__main__":
    main()