-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsentiment.py
89 lines (75 loc) · 3.34 KB
/
sentiment.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import PyPDF2
from transformers import pipeline, AutoTokenizer
import matplotlib.pyplot as plt
import nltk
# nltk.sent_tokenize() needs the Punkt sentence-tokenizer data. Recent NLTK
# releases (3.9 and later) load it from the 'punkt_tab' resource, while older
# releases use 'punkt'; fetch both so the script runs on either.
nltk.download('punkt')
nltk.download('punkt_tab')
def read_pdf(file_path):
    """Return the extracted text of every page of a PDF as one string.

    Args:
        file_path: Location of the PDF, passed straight to ``PyPDF2.PdfReader``.

    Returns:
        The per-page ``extract_text()`` results concatenated in page order,
        with no separator inserted between pages.
    """
    reader = PyPDF2.PdfReader(file_path)
    page_texts = [page.extract_text() for page in reader.pages]
    return "".join(page_texts)
def split_into_sections(text, num_sections):
    """Split *text* into contiguous, near-equal groups of sentences.

    The text is sentence-tokenized with ``nltk.sent_tokenize`` and the
    sentences are distributed over the sections in order. Every sentence is
    kept: when the sentence count is not a multiple of the section count, the
    leading sections each receive one extra sentence.

    Args:
        text: The document text to split.
        num_sections: Desired number of sections; must be at least 1.

    Returns:
        A list of section strings (sentences joined by single spaces). It has
        ``num_sections`` entries, or fewer when the text contains fewer
        sentences than that, so no section is ever empty. Empty text yields
        an empty list.

    Raises:
        ValueError: If ``num_sections`` is less than 1.
    """
    if num_sections < 1:
        raise ValueError("num_sections must be a positive integer")

    sentences = nltk.sent_tokenize(text)

    # Never create more sections than there are sentences; otherwise some
    # sections would be empty strings with nothing to score.
    section_count = min(num_sections, len(sentences))
    if section_count == 0:
        return []

    # The first `extra` sections take one additional sentence so the remainder
    # of the division is spread out instead of being dropped.
    base_size, extra = divmod(len(sentences), section_count)
    sections = []
    start = 0
    for index in range(section_count):
        end = start + base_size + (1 if index < extra else 0)
        sections.append(' '.join(sentences[start:end]))
        start = end
    return sections
def split_into_chunks(text, max_length=512):
    """Split *text* into whitespace-delimited chunks of bounded length.

    Words (as produced by ``str.split()``) are packed greedily into chunks,
    and each chunk is the words joined by single spaces.

    Note that ``max_length`` is measured in *characters* of the joined chunk,
    not in model tokens; it is only a rough proxy for a model's token limit.

    Args:
        text: The text to split.
        max_length: Maximum number of characters per chunk.

    Returns:
        A list of non-empty chunk strings. Every chunk is at most
        ``max_length`` characters long, except that a single word longer than
        ``max_length`` is kept whole as its own chunk rather than being cut.
        Empty or whitespace-only text yields an empty list.
    """
    chunks = []
    current_words = []
    current_length = 0  # always equals len(' '.join(current_words))

    for word in text.split():
        # Length of the chunk if this word were appended; the separating
        # space is only needed when the chunk already holds a word.
        projected = current_length + len(word) + (1 if current_words else 0)
        if current_words and projected > max_length:
            # Flush only a non-empty chunk, so an oversized first word never
            # produces a spurious empty string.
            chunks.append(' '.join(current_words))
            current_words = [word]
            current_length = len(word)
        else:
            current_words.append(word)
            current_length = projected

    if current_words:
        chunks.append(' '.join(current_words))
    return chunks
def analyze_sentiment(sections):
    """Score each section of text with a signed sentiment value.

    Each section is broken into chunks with ``split_into_chunks`` and every
    chunk is classified by the
    ``distilbert-base-uncased-finetuned-sst-2-english`` sentiment pipeline.
    A chunk's score is the classifier confidence, positive for a
    ``POSITIVE`` label and negated otherwise; a section's score is the mean
    of its chunk scores.

    Args:
        sections: Iterable of section strings.

    Returns:
        A list with one float per section, in input order. A section that
        yields no text to score (empty or whitespace-only) gets ``0.0`` so
        the result stays aligned with the input.
    """
    sentiment_analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
    sentiment_scores = []
    for section in sections:
        # Skip empty chunks: there is nothing to classify in them.
        chunks = [chunk for chunk in split_into_chunks(section) if chunk]
        if not chunks:
            # Avoid dividing by zero for a section with no scorable text.
            sentiment_scores.append(0.0)
            continue

        section_scores = []
        for chunk in chunks:
            # Chunks are bounded by characters, not tokens, so ask the
            # tokenizer to truncate anything that still exceeds the model's
            # maximum input length instead of failing.
            result = sentiment_analyzer(chunk, truncation=True)
            signed_scores = [
                r['score'] if r['label'] == 'POSITIVE' else -r['score']
                for r in result
            ]
            section_scores.append(sum(signed_scores) / len(signed_scores))

        sentiment_scores.append(sum(section_scores) / len(section_scores))
    return sentiment_scores
def visualize_sentiment(sentiment_scores, output_image_path):
    """Plot per-section sentiment scores, save the chart, and display it.

    Draws a 10x5 line chart with circular markers, sections numbered from 1
    on the x-axis, and a caption placed below the axes.

    Args:
        sentiment_scores: Sequence of numeric scores, one per section.
        output_image_path: Path the figure is written to via ``plt.savefig``.
    """
    caption = (
        "Figure 1: Sentiment analysis scores for each section of the document. "
        "Notice that the document begins with positive sentiment. "
        "This fishes the reader in, makes them feel ok. People like happy. "
        "Then they drop the hammer of fear and hate."
    )
    caption_style = {
        'wrap': True,
        'horizontalalignment': 'center',
        'fontsize': 12,
    }
    # Sections are numbered starting at 1 for display.
    section_numbers = range(1, len(sentiment_scores) + 1)

    plt.figure(figsize=(10, 5))
    plt.plot(section_numbers, sentiment_scores, marker='o')
    plt.title('Sentiment Analysis of Project 2025\'s Giant Blob of Sinister by Section')
    plt.xlabel('Section')
    plt.ylabel('Sentiment Score')
    plt.grid(True)

    # The caption sits below the axes (negative figure y-coordinate);
    # bbox_inches='tight' makes the saved image include it.
    plt.figtext(0.5, -0.1, caption, **caption_style)
    plt.savefig(output_image_path, bbox_inches='tight')
    plt.show()
if __name__ == "__main__":
    # Input document and destination of the rendered chart.
    pdf_path = './2025_MandateForLeadership_FULL.pdf'
    output_image_path = 'sentiment_analysis_by_section.png'
    num_sections = 1000  # Adjust the number of sections as needed

    # Pipeline: extract text -> split into sections -> score -> plot.
    sections = split_into_sections(read_pdf(pdf_path), num_sections)
    visualize_sentiment(analyze_sentiment(sections), output_image_path)