summarize.py
import math

from transformers import BartForConditionalGeneration, BartTokenizer

# Load the model and tokenizer
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')


def summarize(text, max_summary_length=500):
    # Encode the text, truncating to the model's 1024-token input limit
    inputs = tokenizer.encode("summarize: " + text,
                              return_tensors="pt",
                              max_length=1024,
                              truncation=True)
    # Generate the summary with beam search
    summary_ids = model.generate(inputs,
                                 max_length=max_summary_length,
                                 min_length=max_summary_length // 5,
                                 length_penalty=10.0,
                                 num_beams=4,
                                 early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary
def split_text_into_pieces(text, max_tokens=900, overlap_percent=10):
    # Tokenize the text
    tokens = tokenizer.tokenize(text)
    # Calculate the overlap in tokens
    overlap_tokens = int(max_tokens * overlap_percent / 100)
    # Split the tokens into chunks of size max_tokens with overlap
    pieces = [tokens[i:i + max_tokens]
              for i in range(0, len(tokens), max_tokens - overlap_tokens)]
    # Convert the token pieces back into text
    text_pieces = [tokenizer.decode(tokenizer.convert_tokens_to_ids(piece),
                                    skip_special_tokens=True)
                   for piece in pieces]
    return text_pieces
def recursive_summarize(text, max_length=400):
    tokens = tokenizer.tokenize(text)
    # Work out how many chunks are needed, then size the chunks so the
    # tokens are spread roughly evenly across them
    expected_chunk_count = max(1, math.ceil(len(tokens) / max_length))
    chunk_size = len(tokens) // expected_chunk_count + 2
    # Break the text into overlapping pieces of roughly chunk_size tokens
    pieces = split_text_into_pieces(text, max_tokens=chunk_size)
    print("Number of pieces:", len(pieces))
    # Summarize each piece, keeping each summary shorter than its piece
    summaries = []
    for k, piece in enumerate(pieces):
        print()
        print("Piece:", k + 1, "out of", len(pieces), "pieces")
        print(piece, "\n")
        summary = summarize(piece, max_summary_length=chunk_size * 2 // 3)
        print("SUMMARY:", summary)
        summaries.append(summary)
        print()
    # Join the piece summaries into the final summary
    final_summary = ' '.join(summaries)
    return final_summary
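

# Example usage: a minimal sketch. The placeholder text below and the
# 400-token target are illustrative assumptions, not part of the script above.
if __name__ == "__main__":
    long_text = " ".join(
        ["This is a placeholder sentence standing in for a long document."] * 300)
    final = recursive_summarize(long_text, max_length=400)
    print("FINAL SUMMARY:")
    print(final)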