-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwiki-movie.py
80 lines (66 loc) · 2.67 KB
/
wiki-movie.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
"""
__author__ = "Param Popat"
__version__ = "1"
__git__ = "https://github.com/parampopat/"
"""
import os
import pandas as pd
import wikipedia
from wikipedia.exceptions import DisambiguationError, PageError
def legalize(name, illegal):
    """
    Removes illegal filename characters
    :param name: File Name
    :param illegal: List of illegal Characters
    :return: Legalized File Name (every character found in ``illegal`` dropped)
    """
    # Filter in a single pass instead of calling str.replace() on the whole
    # string once per offending character; the result is the same.
    return "".join(char for char in name if char not in illegal)
def _fetch_content(title):
    """
    Fetches the content of a Wikipedia page with all newlines removed
    :param title: Page title to look up
    :return: Page content as a single line, or None if the title is
             ambiguous or no such page exists
    """
    try:
        return wikipedia.page(title).content.replace("\n", "")
    except (DisambiguationError, PageError):
        return None


def _extract_section(text, header):
    """
    Extracts the text between a section header and the next "==" marker
    :param text: Page content (newlines already removed)
    :param header: Section header, e.g. "== Plot =="
    :return: Section text, or None if the header is not present
    """
    start = text.find(header)
    if start == -1:
        return None
    # Skip past the header itself; use its real length so any task name works.
    start += len(header)
    end = text.find("==", start)
    # No following header means the section runs to the end of the page.
    return text[start:] if end == -1 else text[start:end]


if __name__ == '__main__':
    # Define the task. For Example "Plot"
    task = "Plot"
    author = "_James_Cameron"
    csvname = "jamescameron.csv"

    # Wikipedia header format
    header = "== " + task + " =="

    # List of illegal File Name Characters
    illegalCharacters = [':', '%', '/', '\\', '.', '>', '<', '?', '|', '*', '"', ]

    # Base folder holding the input CSV and the associations CSV.
    baseDir = "F:\\PARAM\\Wiki-Movies\\"
    # Path to save task files.
    path = os.path.join(baseDir, task + author)

    # Create the output folder (and any missing parents) if it does not exist.
    os.makedirs(path, exist_ok=True)

    # Open the CSV containing names of movies (first column)
    listOfMovies = pd.read_csv(os.path.join(baseDir, csvname)).iloc[:, 0].values
    listOfFiles = []

    # For each movie, extract plot and save it in text file.
    for movieName in listOfMovies:
        text = _fetch_content(movieName)

        # If the page is missing, ambiguous, or doesn't have the task header,
        # try adding (film) to the movie name.
        if text is None or header not in text:
            text = _fetch_content(movieName + " (film)")

        # Select text from after the task header till the start of next header.
        plot = _extract_section(text, header) if text is not None else None
        if plot is None:
            # No usable page, or the page has no such section.
            print(movieName, 'NA')
            continue

        # Check and solve for illegal characters in movie name.
        fileName = legalize(movieName, illegalCharacters)

        # Keep a track of filenames
        listOfFiles.append([movieName, fileName])

        # Save the task content in text files (the with-block closes the file).
        with open(os.path.join(path, fileName + ".txt"), 'w', encoding="UTF-8") as f:
            f.write(plot)

    # Save the movie name to file name association for future reference.
    assoc = pd.DataFrame(listOfFiles, columns=['Movie Name', 'File Name'])
    assoc.to_csv(os.path.join(baseDir, task + author + '_associations.csv'), index=False)