# bertmodel.py

# -*- coding: utf-8 -*-
"""bertmodel.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1sS2viacXRbjrU8X3qANQIiHi0KXaR4sD
"""

!pip install transformers numpy pandas
!pip install datasets
import pandas as pd

# Commented out IPython magic to ensure Python compatibility.

import warnings
warnings.filterwarnings('ignore')
# %config Completer.use_jedi = False  # if autocompletion does not work in a Kaggle notebook | hit tab

import pandas as pd

from datasets import load_dataset
import pandas as pd

# Load the "sentences_50agree" configuration of the financial_phrasebank
# dataset through the Hugging Face `datasets` library.
#
# An earlier pd.read_csv() call pointed at the dataset's web page URL rather
# than at a CSV file, and its result was overwritten here before being used,
# so that extra network fetch has been dropped.
dataset = load_dataset("financial_phrasebank", "sentences_50agree", cache_dir=None)

# Convert the 'train' split to a DataFrame and give its two columns the names
# the rest of the script relies on.
df = pd.DataFrame(dataset['train'])
df.columns = ['Sentences', 'sentiment']

# Print the first few rows of the DataFrame
print(df.head())

# Store the label as a pandas categorical column.
df['sentiment'] = df.sentiment.astype('category')

!pip install text_hammer
import text_hammer as th
from tqdm import *

def text_preprocessing(df,col_name):
    """Clean the text held in one column of ``df`` and return ``df``.

    The column named ``col_name`` is overwritten in place, one pass per
    cleaning step, in this order: cast to ``str`` and lower-case, then
    text_hammer's ``cont_exp``, ``remove_emails``, ``remove_html_tags``,
    ``remove_special_chars`` and ``remove_accented_chars``.

    Every pass goes through ``progress_apply``, which is only available once
    tqdm's pandas integration has been registered - presumably done
    elsewhere (e.g. on import of text_hammer); TODO confirm.

    Args:
        df: DataFrame that contains the text column; it is mutated.
        col_name: Name of the column to clean.

    Returns:
        The same ``df`` object, with the cleaned column.
    """
    cleaning_steps = (
        lambda text: str(text).lower(),
        th.cont_exp,
        th.remove_emails,
        th.remove_html_tags,
        th.remove_special_chars,
        th.remove_accented_chars,
    )

    # Order matters: each step consumes the output of the previous one.
    for step in cleaning_steps:
        df[col_name] = df[col_name].progress_apply(step)

    return df

# Run the cleaning passes over the sentence column.
df = text_preprocessing(df, 'Sentences')

# Number of whitespace-separated tokens in each cleaned sentence.
df['words'] = df['Sentences'].apply(lambda sentence: len(sentence.split()))

# Keep only the rows that carry a sentiment label.
has_label = df['sentiment'].notnull()
df = df[has_label]
df.head()
df.shape

# Longest sentence, in words (displayed only when run as a notebook cell).
df.words.max()

from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; stratifying on the label keeps the
# class proportions the same in both parts, and the fixed seed makes the
# split reproducible.
train, test = train_test_split(
    df,
    test_size=0.3,
    random_state=42,
    stratify=df['sentiment'],
)
train.shape

!pip install tensorflow keras

from tensorflow.keras.utils import to_categorical
# Notebook-style peek at the first rows of the training split; the value is
# discarded when this file is executed as a plain script.
train.head()

import shutil
from transformers import AutoTokenizer,TFBertModel

# NOTE(review): text_preprocessing lower-cases every sentence, while this is
# the cased checkpoint - confirm whether an uncased checkpoint was intended.
checkpoint = 'bert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
bert = TFBertModel.from_pretrained(checkpoint)

# Save the tokenizer and the model to local directories so they can be
# loaded again later.
tokenizer.save_pretrained('bert-tokenizer')
bert.save_pretrained('bert-model')

# Pack the saved tokenizer directory into bert-tokenizer.zip.
shutil.make_archive(base_name='bert-tokenizer', format='zip', root_dir='bert-tokenizer')

# Quick sanity check of the tokenizer on a sample sentence.
tokenizer('hello I am Aishwarya')

# Fixed token width of every encoded sentence. It has to equal the width of
# the model's Input layers (max_len = 60 in the model definition).
MAX_SEQ_LEN = 60


def _encode_sentences(sentences):
    """Tokenize a list of sentences into fixed-width TensorFlow tensors.

    ``padding='max_length'`` pads every sequence to exactly ``MAX_SEQ_LEN``
    tokens and ``truncation=True`` cuts longer ones down to it. With
    ``padding=True`` the width would instead follow the longest sentence of
    each call, so the train and test encodings could end up with different
    widths from each other and from the model's fixed-size inputs.

    Args:
        sentences: List of strings to encode.

    Returns:
        The tokenizer output holding ``input_ids``, ``token_type_ids`` and
        ``attention_mask`` as TensorFlow tensors.
    """
    return tokenizer(
        text=sentences,
        add_special_tokens=True,
        max_length=MAX_SEQ_LEN,
        truncation=True,
        padding='max_length',
        return_tensors='tf',
        return_token_type_ids=True,
        return_attention_mask=True,
        verbose=True)


x_train = _encode_sentences(train.Sentences.tolist())
x_test = _encode_sentences(test.Sentences.tolist())

x_train

from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.utils import to_categorical

import tensorflow as tf
from tensorflow.keras.layers import Input, Dense

# Ask TensorFlow which GPU devices it can see (shown only in a notebook).
tf.config.experimental.list_physical_devices('GPU')

max_len = 60

# Two integer inputs of width max_len: the token ids and the attention mask.
input_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
input_mask = Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")

# Element 0 of the BERT output is the last hidden states
# (element 1 would be the pooler_output).
embeddings = bert(input_ids, attention_mask=input_mask)[0]

# Classification head. The layers are created in the same order in which
# they are applied, so auto-generated layer names stay stable.
head_layers = [
    tf.keras.layers.GlobalMaxPool1D(),
    Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.1),
    Dense(32, activation='relu'),
    Dense(16, activation='relu'),
]
out = embeddings
for head_layer in head_layers:
    out = head_layer(out)

# Three-way softmax output, one unit per sentiment class.
y = Dense(3, activation='softmax')(out)

model = tf.keras.Model(inputs=[input_ids, input_mask], outputs=y)
# Presumably index 2 is the BERT layer (after the two Input layers), marked
# trainable so it is fine-tuned along with the head - TODO confirm the index.
model.layers[2].trainable = True

import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers.schedules import PolynomialDecay

# Training-run geometry; these mirror the epochs / batch_size arguments that
# are passed to model.fit.
EPOCHS = 3
BATCH_SIZE = 36

initial_learning_rate = 5e-5
# Decay over the whole run: (optimizer steps per epoch) * (number of epochs).
# The sample count comes from the training DataFrame; len() of the tokenizer
# output would only count its keys (input_ids, token_type_ids,
# attention_mask) and drive decay_steps down to 0.
steps_per_epoch = -(-len(train) // BATCH_SIZE)  # ceiling division
decay_steps = max(1, EPOCHS * steps_per_epoch)
end_learning_rate = 1e-7
power = 1.0  # power 1.0 makes the polynomial decay linear

lr_schedule = PolynomialDecay(
    initial_learning_rate,
    decay_steps,
    end_learning_rate,
    power
)

optimizer = Adam(
    learning_rate=lr_schedule,
    epsilon=1e-08,
    clipnorm=1.0)

# The model ends in a 3-unit softmax and is trained on one-hot targets, so
# the loss is categorical cross-entropy on probabilities (not logits) and
# accuracy is measured per sample rather than per output unit. The metric
# keeps the name 'accuracy' so the history keys are unchanged.
loss = tf.keras.losses.CategoricalCrossentropy(from_logits=False)
metric = tf.keras.metrics.CategoricalAccuracy(name='accuracy')

model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=[metric])

# Model inputs are passed as dicts whose keys are the names given to the
# model's Input layers.
input_names = ('input_ids', 'attention_mask')

train_inputs = {name: x_train[name] for name in input_names}
train_labels = to_categorical(train.sentiment)

val_inputs = {name: x_test[name] for name in input_names}
val_labels = to_categorical(test.sentiment)

# Train, using the held-out split as validation data after every epoch.
train_history = model.fit(
    x=train_inputs,
    y=train_labels,
    validation_data=(val_inputs, val_labels),
    epochs=3,
    batch_size=36,
)

# Save the trained weights and print the per-epoch metrics as a table.
model.save_weights('sentiment_weights.h5')
metrics_df = pd.DataFrame(train_history.history)
print(metrics_df)