-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_scraping.py
95 lines (69 loc) · 2.51 KB
/
data_scraping.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
##########################################################
## APS Database Server ##
## ##
## Authors: [email protected], ##
## Date: November 3, 2022 ##
## Version: 1 ##
##########################################################
import os
from pathlib import Path
class DataScraping():
    """
    Scrape the data folder named in the passed config (data_dir=..) and
    report the files and the empty folders found below it. Entries whose
    name is listed under the config's exclude keyword (excluded=..) are
    ignored.

    Attributes
    ----------
    conf : ConfigObj
        A configuration file object. The library configobj2 was used in v1
    ddir : str
        Root directory that is scanned, read from conf['Data']['data_dir'].
    excluded_files : list
        File/directory names that are never listed, read from
        conf['Data']['excluded'] and always stored as a list.
    """

    def __init__(self, conf):
        self.conf = conf
        self.ddir = self.conf['Data']['data_dir']

        excluded = self.conf['Data']['excluded']
        # A single value without a comma comes back from the config as a
        # plain string. Using `in` on a string is a substring test, which
        # would hide e.g. "secret" when only "secret.txt" is excluded, so
        # the setting is normalised to a list of names here.
        if excluded is None:
            excluded = []
        elif isinstance(excluded, str):
            excluded = [excluded] if excluded else []
        self.excluded_files = list(excluded)

    def shouldbe_listed_path(self, path):
        """
        Decide whether a file/directory should be displayed on the website.

        Standard behavior: (empty folders + files) minus the entries whose
        name is in self.excluded_files are displayed. Directories that have
        content are not listed themselves, because their content is listed.

        Parameters
        ----------
        path : pathlib.Path or str
            The file or directory to check.

        Returns
        -------
        bool
            True if the path should be listed, False otherwise.
        """
        path = Path(path)

        # Exact match on the last path component only (no wildcards).
        if path.name in self.excluded_files:
            return False

        try:
            # Directory with content? Its entries are reported individually.
            if path.is_dir() and os.listdir(path):
                return False
        except OSError as e:
            # Best effort: an unreadable or vanished directory is reported
            # on stdout and the path is still listed.
            print("Exception: ", e)

        return True

    def scandir(self):
        """
        Returns a generator object of paths to be listed on the website.

        Walks self.ddir recursively and yields every pathlib.Path accepted
        by shouldbe_listed_path.
        """
        for p in Path(self.ddir).rglob("*"):
            if self.shouldbe_listed_path(p):
                yield p

    def search_results(self, sterms):
        """
        Filter the results returned by 'scandir' for keywords given in sterms.
        Only if all keywords are found in a path it is returned.

        The comparison is case-insensitive and is made against the complete
        path string (including the data directory prefix).

        Parameters
        ----------
        sterms: list
            A list (can be empty) of search terms (str) that all should be
            included in the path. An empty list or None returns all paths.

        Returns
        -------
        list
            The matching paths as strings, in their original spelling.
        """
        paths = [str(p) for p in self.scandir()]
        if not sterms:  # no search term - return all data
            return paths

        needles = [s.lower() for s in sterms]
        matches = []
        for original in paths:
            lowered = original.lower()  # lower once per path, not per term
            if all(needle in lowered for needle in needles):
                matches.append(original)
        return matches