bild_collectdata.py
__author__ = 'Esther'
import re
import urllib.request
import xml.etree.ElementTree as ET
################################################################################
### NLP part #
################################################################################
def find_compounds(text):
    """
    Find compounds with a linking "-" via regular expression.
    """
    # Raw string avoids the deprecated "\-" escape; the hyphen needs no
    # escaping outside a character class.
    compound_pattern = re.compile(r"[A-ZÄÖÜa-zäöüß]+-[A-ZÄÖÜa-zäöüß]+")
    firsts = set()
    seconds = set()
    for item in re.findall(compound_pattern, text):
        f, s = item.split("-")
        firsts.add(f)
        seconds.add(s)
    return [firsts, seconds]
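

# A quick sanity check (hypothetical sample string, not taken from the feed):
#   find_compounds("Bundes-Liga und Schnäppchen-Angebot")
#   == [{"Bundes", "Schnäppchen"}, {"Liga", "Angebot"}]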
################################################################################
### data collection part #
################################################################################
def write_data(first_path, second_path, data):
    """
    Write data to files in order to access word parts later.
    New word parts are merged with any parts already stored on disk.
    """
    firsts, seconds = set(), set()
    try:
        with open(first_path, "r", encoding="utf8") as first_file:
            firsts = {line.strip() for line in first_file}
        with open(second_path, "r", encoding="utf8") as second_file:
            seconds = {line.strip() for line in second_file}
    except FileNotFoundError:
        pass  # first run: nothing stored yet
    firsts.update(data[0])
    seconds.update(data[1])
    with open(first_path, "w", encoding="utf8") as first_file:
        for item in firsts:
            first_file.write(item + "\n")
    with open(second_path, "w", encoding="utf8") as second_file:
        for item in seconds:
            second_file.write(item + "\n")
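

# Example call with the paths used below. Note that the ./bild_data directory
# must already exist, since open(..., "w") does not create directories:
#   write_data("./bild_data/1.txt", "./bild_data/2.txt", [{"Bundes"}, {"Liga"}])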
def data_collect():
    """
    Open the RSS feed, read the link descriptions of the last 100 posts,
    look for compounds, and write the results to files.
    """
    feedurl = "http://www.bild.de/rss/feeds/bild-3409238,n=100,sort=1,view=rss2.bild.xml"
    feedpage = urllib.request.urlopen(feedurl)
    feedcontent = feedpage.read().decode("utf-8")
    # ET.fromstring() returns the <rss> root; its first child is the <channel>.
    data = ET.fromstring(feedcontent)[0]
    content = ""
    for child in data:
        if child.tag == "item":
            # child[0] is assumed to be the element carrying the post text
            # (the first child of each <item> in this feed).
            content += child[0].text + "\n"
    comps = find_compounds(content)
    write_data("./bild_data/1.txt", "./bild_data/2.txt", comps)


if __name__ == "__main__":
    data_collect()
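
# Running the script repeatedly (e.g. via a cron job) accumulates word parts
# over time, since write_data() merges each run's finds with the stored sets.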