# distilbert.py

# -*- coding: utf-8 -*-
"""DISTILBERT.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1I3Je_S2kfxxq-YKdUk_GmiYg6VrKreLA
"""

!pip install transformers

import numpy as np
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support
from torch import nn

! pip install -q kaggle

import os
os.environ['KAGGLE_USERNAME'] = 'aishwaryaahuja'
os.environ['KAGGLE_KEY'] = 'f7cb3c6486bbef40a8748c4ac5eb6531'

!kaggle datasets download -d sbhatti/financial-sentiment-analysis -p /content

df = pd.read_csv('/content/financial-sentiment-analysis.zip')
df.head()

!pip install text_hammer
import text_hammer as th
from tqdm import *

def text_preprocessing(df,col_name):
  """Clean the text stored in ``df[col_name]`` in place and return ``df``.

  The column is rewritten once per step, in this order: cast to ``str`` and
  lower-case, ``th.cont_exp``, ``th.remove_emails``, ``th.remove_html_tags``,
  ``th.remove_special_chars``, ``th.remove_accented_chars``. Every pass is
  applied through the Series' ``progress_apply``.

  NOTE(review): ``progress_apply`` is only available once tqdm's pandas
  integration has been registered; this function does not register it --
  presumably one of the imports does, confirm.
  """
  cleaning_steps = (
      lambda text: str(text).lower(),
      th.cont_exp,
      th.remove_emails,
      th.remove_html_tags,
      th.remove_special_chars,
      th.remove_accented_chars,
  )
  for step in cleaning_steps:
    df[col_name] = df[col_name].progress_apply(step)
  return df

# Clean the raw sentences before anything is derived from them.
df = text_preprocessing(df, 'Sentence')

# Integer class ids for the sentiment strings; a value that is not a key of
# this mapping becomes NaN after .map() and is filtered out below.
encoding = {'neutral': 1, 'negative': 0, 'positive': 2}
df['Sentiment'] = df['Sentiment'].map(encoding)

# Whitespace-delimited word count of each cleaned sentence.
df['words'] = df['Sentence'].apply(lambda sentence: len(sentence.split()))

# Keep only the rows whose sentiment was mapped to a class id.
df = df[df['Sentiment'].notnull()]
df.head()

from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing. The split is stratified on the
# Sentiment column and seeded so that it is reproducible.
train, test = train_test_split(
    df,
    test_size=0.3,
    random_state=42,
    stratify=df['Sentiment'],
)
train.shape

from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding

# Tokenizer for the distilbert-base-uncased checkpoint; downloaded files are
# cached under ./all_models.
tokenizer = AutoTokenizer.from_pretrained(
    'distilbert-base-uncased',
    cache_dir='./all_models',
)

!pip install datasets
def preprocess_function(examples):
  """Tokenize the ``"Sentence"`` field of ``examples``.

  Uses the module-level ``tokenizer`` with truncation enabled; no padding is
  requested here. The tokenizer's output is returned unchanged.
  """
  sentences = examples["Sentence"]
  encoded = tokenizer(sentences, truncation=True)
  return encoded
from datasets import Dataset

# Wrap the pandas splits as Hugging Face datasets and tokenize the sentences.
train_dataset = Dataset.from_pandas(train)
test_dataset = Dataset.from_pandas(test)

tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_test = test_dataset.map(preprocess_function, batched=True)

# Class ids must be integers for the classification loss. Mapping the
# Sentiment strings can leave the column as float when unmapped values were
# present, so the cast is made explicit.
train_labels = train["Sentiment"].astype(int).tolist()
test_labels = test["Sentiment"].astype(int).tolist()

# The target column has to be named "labels". Trainer drops every dataset
# column that is not an argument of the model's forward() (apart from
# "label"/"label_ids"), so a column called "Label" is removed before batching
# and the model never receives targets to compute a loss from.
tokenized_train = tokenized_train.add_column("labels", train_labels)
tokenized_test = tokenized_test.add_column("labels", test_labels)

type(tokenized_train)

# Collator that pads the tokenized examples of a batch using the tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

from transformers import AutoModelForSequenceClassification
from datasets import load_metric
from transformers import TrainingArguments, Trainer

from huggingface_hub import notebook_login

# Sequence-classification model initialised from distilbert-base-uncased with
# three output labels (matching the three sentiment classes); downloaded
# files are cached under ./all_models.
model = AutoModelForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=3,
    cache_dir='./all_models',
    return_dict=True,
)

from sklearn.metrics import accuracy_score, precision_recall_curve
from sklearn.metrics import confusion_matrix
def compute_metrics(pred):
  """Compute classification metrics from a Trainer prediction object.

  Args:
    pred: object exposing ``label_ids`` (the true class ids) and
      ``predictions`` (per-class scores); the arg-max over the last axis of
      ``predictions`` is taken as the predicted class.

  Returns:
    dict with the true and predicted class ids (``y_true``, ``y_pred``),
    ``accuracy``, support-weighted ``f1`` / ``precision`` / ``recall`` and
    the ``confusion_matrix`` computed over the classes 0, 1 and 2.
  """
  # The attribute is ``label_ids``; ``labels_ids`` does not exist on the
  # prediction object and raised AttributeError.
  labels = pred.label_ids
  preds = pred.predictions.argmax(-1)

  # ``pos_label`` only applies to binary averaging, so it is not passed with
  # ``average='weighted'``.
  precision, recall, f1, _ = precision_recall_fscore_support(
      labels, preds, average='weighted'
  )
  cm = confusion_matrix(labels, preds, labels=[0, 1, 2])
  acc = accuracy_score(labels, preds)

  # NOTE(review): y_true, y_pred and confusion_matrix are arrays rather than
  # scalars; if these metrics end up in a JSON-serialised training log they
  # may need converting with ``.tolist()`` -- confirm against how the
  # results are consumed.
  return {
      'y_true': labels,
      'y_pred': preds,
      'accuracy': acc,
      'f1': f1,
      'precision': precision,
      'recall': recall,
      'confusion_matrix': cm,
  }

# Log in to the Hugging Face Hub; the training arguments below enable
# push_to_hub.
notebook_login()

repo_name = 'sentiment_test_model'

training_args = TrainingArguments(
    # Output location and checkpointing.
    output_dir=repo_name,
    save_strategy="epoch",
    push_to_hub=True,
    # Optimisation schedule.
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_steps=100,
    fp16=False,
    # Batch sizes.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Logging cadence.
    logging_steps=100,
)

# Bundle the model, its training configuration, both tokenized splits and the
# batching/metric helpers into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,
)

class SentimentClassifier(nn.Module):
    """Linear sentiment head on top of an encoder model.

    The wrapped ``model`` is called with ``input_ids`` and ``attention_mask``
    and must return an object exposing ``pooler_output`` or
    ``last_hidden_state``.
    NOTE(review): this means a bare encoder is expected, not a model that
    already ends in a classification head -- confirm against the caller.

    Args:
        model: encoder producing the sentence representation.
        num_labels: number of output classes. Defaults to 3 to match the
            negative / neutral / positive label encoding; a 2-way head
            cannot score class id 2.
        hidden_size: width of the encoder output. When omitted it is read
            from ``model.config.hidden_size`` and falls back to 768.
    """

    def __init__(self, model, num_labels=3, hidden_size=None):
        super().__init__()
        self.model = model
        if hidden_size is None:
            config = getattr(model, "config", None)
            hidden_size = getattr(config, "hidden_size", None) or 768
        self.classifier = nn.Linear(hidden_size, num_labels)
        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask, labels=None):
        """Return ``(loss, logits)``; ``loss`` is ``None`` without ``labels``."""
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        # Not every encoder output has a pooled vector (DistilBERT's base
        # model output does not), so fall back to the hidden state of the
        # first token.
        pooled = getattr(outputs, "pooler_output", None)
        if pooled is None:
            pooled = outputs.last_hidden_state[:, 0]
        logits = self.classifier(pooled)
        loss = None if labels is None else self.loss_fn(logits, labels)
        return loss, logits

    def training_step(self, model, inputs):
        """Run one forward pass on a batch and return its loss and logits.

        ``model`` is accepted for signature compatibility and is not used.
        The targets are read from ``inputs["labels"]`` when present,
        otherwise from ``inputs["label"]``.
        """
        input_ids = inputs["input_ids"]
        attention_mask = inputs["attention_mask"]
        labels = inputs["labels"] if "labels" in inputs else inputs["label"]

        loss, logits = self(input_ids, attention_mask, labels)

        return {"loss": loss, "logits": logits}

# Start fine-tuning and keep the value returned by the run.
train_result = trainer.train()