-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathml_visual.py
161 lines (136 loc) · 4.98 KB
/
ml_visual.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
import streamlit as st
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
# Human-readable subject areas, indexed by the classifier's output class id.
_SUBJECT_AREAS = (
    'Sciences',
    'Health and Medicine',
    'Engineering and Technology',
    'Arts and Social Sciences and Humanities',
    'Mathematics and Multidisciplinary',
    'Economic and Business and Finance',
)

# Hugging Face Hub id of the fine-tuned sequence-classification checkpoint.
_MODEL_NAME = "KTAP8/GopherSubjectArea"


@st.cache_resource(show_spinner=False)
def _load_classifier():
    """Load the tokenizer and model once and reuse them across script reruns.

    Streamlit re-executes this script on every user interaction, so without
    caching the checkpoint would be re-read (and re-moved to the device) on
    every click of "Submit". ``st.cache_resource`` keeps a single shared
    instance alive for the lifetime of the server process.

    Returns:
        A ``(tokenizer, model, device)`` tuple; the model is already on
        ``device`` and switched to evaluation mode.
    """
    tokenizer = AutoTokenizer.from_pretrained(_MODEL_NAME)
    model = AutoModelForSequenceClassification.from_pretrained(_MODEL_NAME)

    # Prefer the GPU when one is available, otherwise fall back to the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()
    return tokenizer, model, device


def predict(text: str) -> str:
    """Classify an abstract into one of the generalized subject areas.

    Args:
        text: The abstract to classify.

    Returns:
        The entry of ``_SUBJECT_AREAS`` whose index equals the class with the
        highest logit.

    Raises:
        IndexError: If the model predicts a class id outside
            ``_SUBJECT_AREAS``.
    """
    # Clear GPU cache left over from earlier runs before doing new work.
    torch.cuda.empty_cache()

    tokenizer, model, device = _load_classifier()

    # Tokenize as a batch of one, truncated/padded to exactly 512 tokens.
    inputs = tokenizer(
        [text],
        truncation=True,
        padding="max_length",
        max_length=512,
        return_tensors="pt",
    )
    # The input tensors must live on the same device as the model.
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model(**inputs).logits

    # The batch holds a single sequence, so take the first (only) prediction.
    predicted_index = int(torch.argmax(logits, dim=-1)[0])
    return _SUBJECT_AREAS[predicted_index]
# Page header.
st.title("Machine Learning Module")
st.subheader("Predict Subject Area from Abstract")

# Input row: a wide column for the abstract and a narrow one for the button.
input_col, button_col = st.columns([3, 1])
abstract_text = input_col.text_area(
    "Enter the abstract below:",
    height=150,
    help="The text box will expand as you type more content.",
)
submit = button_column_clicked = button_col.button("Submit")

# Reserved slot, created before any message, where the prediction is rendered.
result_placeholder = st.empty()

# Only act on a click; reject blank / whitespace-only abstracts.
if submit and not abstract_text.strip():
    st.error("Please enter an abstract before submitting!")
elif submit:
    # Show a spinner while the model runs, then fill the reserved slot.
    with st.spinner("Predicting..."):
        result = predict(abstract_text)
    result_placeholder.markdown(f"### Prediction: {result}")
# Hard-coded evaluation scores, stored as fractions and shown as percentages.
metrics = dict(
    eval_accuracy=0.8483373884833739,
    eval_f1=0.8484789634216419,
    eval_precision=0.8486998899398802,
    eval_recall=0.8483373884833739,
)

# One table row per metric, with the value rendered to two decimal places.
formatted_metrics = [
    {"Metric": name, "Value (%)": format(score * 100, ".2f")}
    for name, score in metrics.items()
]

# Render the rows as a static table under their own subheading.
st.subheader("Evaluation Metrics (model performance)")
st.table(formatted_metrics)
# Mapping from each generalized field to the subject-area codes grouped under
# it. The comment above each entry spells out what the codes stand for.
generalized_fields = {
    # AGRI Agricultural and Biological Sciences; BIOC Biochemistry, Genetics
    # and Molecular Biology; EART Earth and Planetary Sciences;
    # ENVI Environmental Science; MATE Materials Science;
    # PHYS Physics and Astronomy; CHEM Chemistry
    "Sciences": ["AGRI", "BIOC", "EART", "ENVI", "MATE", "PHYS", "CHEM"],
    # DENT Dentistry; HEAL Health Professions; IMMU Immunology and
    # Microbiology; MEDI Medicine; NEUR Neuroscience; NURS Nursing;
    # PHAR Pharmacology, Toxicology and Pharmaceutics; VETE Veterinary
    "Health and Medicine": [
        "DENT", "HEAL", "IMMU", "MEDI", "NEUR", "NURS", "PHAR", "VETE",
    ],
    # CENG Chemical Engineering; COMP Computer Science; ENER Energy;
    # ENGI Engineering
    "Engineering and Technology": ["CENG", "COMP", "ENER", "ENGI"],
    # ARTS Arts and Humanities; DECI Decision Sciences; PSYC Psychology;
    # SOCI Social Sciences
    "Arts and Social Sciences and Humanities": ["ARTS", "DECI", "PSYC", "SOCI"],
    # MATH Mathematics; MULT Multidisciplinary
    "Mathematics and Multidisciplinary": ["MATH", "MULT"],
    # BUSI Business, Management and Accounting; ECON Economics, Econometrics
    # and Finance
    "Economic and Business and Finance": ["BUSI", "ECON"],
}
# Collapsible reference panel listing every generalized field and its codes.
field_guide = st.expander("Generalized Field Guide")
field_guide.write("Below is the mapping of generalized fields to their respective subfields:")
for field, subfields in generalized_fields.items():
    # Bold heading for the field, then one bullet per subject-area code.
    field_guide.markdown(f"**{field}:**")
    for subfield in subfields:
        field_guide.write(f"- {subfield}")