Pre release updates (#2)
* Switch to Tuple for type hint

* Update link to articles in README.md

* Expand docstrings in fine-tune.py

---------

Co-authored-by: Ehssan <>
ekintel authored Jan 17, 2025
1 parent 1322c2a commit 553432a
Showing 3 changed files with 95 additions and 11 deletions.
4 changes: 2 additions & 2 deletions data-generator/README.md
@@ -76,14 +76,14 @@ prompt_examples = "Examples for the Few-Shot Chain-of-Thought prompt."

### `read_token() -> None`

- Reads a Hugging Face token from a file named 'token.txt' and logs in using the token. (See (#article) to learn how to create and access your Hugging Face token.)
+ Reads a Hugging Face token from a file named 'token.txt' and logs in using the token. (See [Article](#article) to learn how to create and access your Hugging Face token.)

The file is expected to be located in the same directory as the script. If the file is missing, inaccessible, or another error occurs, the program will terminate with an appropriate error message.

#### Raises:
- `SystemExit`: If the token file is not found, permission is denied, or any other error occurs while reading the file.
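
A minimal usage sketch of what this could look like (assuming `huggingface_hub`'s `login` and a `token.txt` next to the script; an illustration, not the committed implementation):

```python
import sys

from huggingface_hub import login

def read_token() -> None:
    try:
        with open("token.txt", encoding="utf-8") as f:
            token = f.read().strip()
    except OSError as err:  # missing file, permission denied, etc.
        sys.exit(f"Error reading token file: {err}")
    login(token)
```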

- ### `parse_string(input_string: str) -> tuple[str, str]`
+ ### `parse_string(input_string: str) -> Tuple[str, str]`

Parses a string containing `OUTPUT:` and `REASONING:` sections and extracts their values.
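
A hedged usage example (the exact separator between the two sections is an assumption based on the description above):

```python
raw = "OUTPUT: positive\nREASONING: The review praises the product."
output, reasoning = parse_string(raw)
# output    -> "positive"
# reasoning -> "The review praises the product."
```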

6 changes: 3 additions & 3 deletions data-generator/sdg.py
@@ -4,7 +4,7 @@
import re
import sys
from datetime import datetime
- from typing import Dict, List
+ from typing import Dict, List, Tuple

import pandas as pd
from huggingface_hub import login
@@ -45,15 +45,15 @@ def read_token() -> None:
login(token)


- def parse_string(input_string: str) -> tuple[str, str]:
+ def parse_string(input_string: str) -> Tuple[str, str]:
"""
Parses a string containing `OUTPUT:` and `REASONING:` sections and extracts their values.
Args:
input_string (str): The input string containing `OUTPUT:` and `REASONING:` labels.
Returns:
- tuple[str, str]: A tuple containing two strings:
+ Tuple[str, str]: A tuple containing two strings:
- The content following `OUTPUT:`.
- The content following `REASONING:`.
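
The function body is collapsed in this view; a plausible implementation given the `re` and `Tuple` imports above (a sketch, not the committed code):

```python
def parse_string(input_string: str) -> Tuple[str, str]:
    # Capture everything after OUTPUT: up to REASONING:, then the rest.
    match = re.search(r"OUTPUT:\s*(.*?)\s*REASONING:\s*(.*)", input_string, re.DOTALL)
    if match is None:
        raise ValueError("Expected OUTPUT: and REASONING: sections.")
    return match.group(1).strip(), match.group(2).strip()
```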
96 changes: 90 additions & 6 deletions fine-tuner/fine-tune.py
@@ -213,12 +213,32 @@ def __init__(
tokenizer: PreTrainedTokenizer,
max_length: int = 512,
):
"""
Initialize the class with data, tokenizer, and optional max_length.
Args:
data (Union[pd.DataFrame, List[str]]): The input data, either as a pandas DataFrame or a list of strings.
tokenizer (PreTrainedTokenizer): The tokenizer used for processing text data.
max_length (int, optional): The maximum length for tokenized sequences. Defaults to 512.
Attributes:
data: The input data, either a pandas DataFrame or a list of strings.
tokenizer: The tokenizer for processing text.
max_length: The maximum length for tokenized sequences.
is_dataframe (bool): A flag indicating whether the data is a pandas DataFrame (True) or a list of strings (False).
"""
self.data = data
self.tokenizer = tokenizer
self.max_length = max_length
self.is_dataframe = hasattr(data, "iloc") # Check if the data is a dataframe

def __len__(self) -> int:
"""
Return the number of elements in the data.
Returns:
int: The length of the dataframe or list.
"""
return len(self.data)

def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:
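
The body of `__getitem__` is collapsed here; a typical implementation for this kind of dataset would tokenize one example, roughly as follows (a sketch that assumes DataFrame rows expose `text` and `label` columns):

```python
def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:
    # Fetch the raw text (and the label when reading from a DataFrame).
    if self.is_dataframe:
        row = self.data.iloc[idx]
        text, label = row["text"], row["label"]
    else:
        text, label = self.data[idx], None

    encoding = self.tokenizer(
        text,
        truncation=True,
        padding="max_length",
        max_length=self.max_length,
        return_tensors="pt",
    )
    item = {key: value.squeeze(0) for key, value in encoding.items()}
    if label is not None:
        item["label"] = torch.tensor(label)
    return item
```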
@@ -334,6 +354,28 @@ def __init__(
weight_decay: float = 0.01,
num_labels: int = 4,
):
"""
Initialize the model for sequence classification with hyperparameters and metrics.
Args:
model (AutoModelForSequenceClassification): The pre-trained model for sequence classification.
num_training_steps (int): The total number of training steps.
learning_rate (float, optional): The learning rate for the optimizer. Defaults to 5e-5.
weight_decay (float, optional): The weight decay for the optimizer. Defaults to 0.01.
num_labels (int, optional): The number of labels for classification. Defaults to 4.
Attributes:
model: The pre-trained model for sequence classification.
num_training_steps: The total number of training steps.
learning_rate: The learning rate for the optimizer.
weight_decay: The weight decay for the optimizer.
num_labels: The number of labels for classification.
f1_score: The F1 score metric for evaluation, weighted by class.
val_f1: The F1 score metric for validation, weighted by class.
test_f1: The F1 score metric for test, weighted by class.
val_acc: The accuracy metric for validation.
test_acc: The accuracy metric for test.
"""
super().__init__()
self.save_hyperparameters(ignore=["model"])
self.model = model
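
The rest of `__init__` is collapsed; given the attributes listed in the docstring, it plausibly continues along these lines (a sketch assuming `torchmetrics`, not the committed code):

```python
# Continuation sketch (assumed, not the committed code):
self.num_training_steps = num_training_steps
self.learning_rate = learning_rate
self.weight_decay = weight_decay
self.num_labels = num_labels
self.val_f1 = torchmetrics.F1Score(task="multiclass", num_classes=num_labels, average="weighted")
self.test_f1 = torchmetrics.F1Score(task="multiclass", num_classes=num_labels, average="weighted")
self.val_acc = torchmetrics.Accuracy(task="multiclass", num_classes=num_labels)
self.test_acc = torchmetrics.Accuracy(task="multiclass", num_classes=num_labels)
```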
@@ -364,14 +406,25 @@ def forward(
attention_mask: torch.Tensor,
labels: Optional[torch.Tensor] = None,
) -> Dict:
"""
Forward pass through the model.
Args:
input_ids (torch.Tensor): Input tensor containing token IDs.
attention_mask (torch.Tensor): Tensor indicating the attention mask for padding tokens.
labels (Optional[torch.Tensor], optional): Labels for computing loss. Defaults to None.
Returns:
Dict: Output of the model.
"""
return self.model(input_ids, attention_mask=attention_mask, labels=labels)
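
Because this delegates to the Hugging Face model, the output is a `SequenceClassifierOutput` exposing `loss` and `logits`; a hedged usage sketch (`model` and `batch` are hypothetical instances):

```python
# Hypothetical batch produced by the dataset above.
outputs = model(
    input_ids=batch["input_ids"],
    attention_mask=batch["attention_mask"],
    labels=batch["label"],
)
loss, logits = outputs.loss, outputs.logits
```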

- def step(self, batch: dict, stage: str) -> Optional[torch.Tensor]:
+ def step(self, batch: Dict, stage: str) -> Optional[torch.Tensor]:
"""
A single step function for training, validation, and testing.
Args:
- batch (dict): A batch of data containing 'input_ids', 'attention_mask', and 'label'.
+ batch (Dict): A batch of data containing 'input_ids', 'attention_mask', and 'label'.
stage (str): One of "train", "val", or "test" to indicate the current phase.
Returns:
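
The remainder of the docstring and the method body are collapsed in this view; a hedged sketch of a typical implementation, consistent with the metrics and logging shown below (an assumption, not the committed code):

```python
def step(self, batch: Dict, stage: str) -> Optional[torch.Tensor]:
    outputs = self(
        input_ids=batch["input_ids"],
        attention_mask=batch["attention_mask"],
        labels=batch["label"],
    )
    preds = torch.argmax(outputs.logits, dim=-1)
    if stage == "val":
        self.val_acc.update(preds, batch["label"])
        self.val_f1.update(preds, batch["label"])
    elif stage == "test":
        self.test_acc.update(preds, batch["label"])
        self.test_f1.update(preds, batch["label"])
    if stage == "train":
        self.log("train_loss", outputs.loss, prog_bar=True)
        return outputs.loss
    return None
```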
@@ -413,16 +466,47 @@ def log_metrics(self, stage: str):
self.log("test_acc", self.test_acc.compute(), prog_bar=True)
self.log("test_f1", self.test_f1.compute(), prog_bar=True)

- def training_step(self, batch: dict, batch_idx: int) -> torch.Tensor:
+ def training_step(self, batch: Dict, batch_idx: int) -> torch.Tensor:
"""
Perform a training step.
Args:
batch (Dict): A batch of data, containing input tensors and labels.
batch_idx (int): The index of the batch.
Returns:
torch.Tensor: The loss value for this step, which is passed back to the optimizer.
"""
return self.step(batch, "train")

- def validation_step(self, batch: dict, batch_idx: int) -> None:
+ def validation_step(self, batch: Dict, batch_idx: int) -> None:
"""
Perform a validation step.
Args:
batch (Dict): A batch of validation data.
batch_idx (int): The index of the batch.
"""
self.step(batch, "val")

- def test_step(self, batch: dict, batch_idx: int) -> None:
+ def test_step(self, batch: Dict, batch_idx: int) -> None:
"""
Perform a test step.
Args:
batch (Dict): A batch of test data.
batch_idx (int): The index of the batch.
"""
self.step(batch, "test")

- def configure_optimizers(self):
+ def configure_optimizers(self) -> Tuple[List[torch.optim.AdamW], List[Dict]]:
"""
Configure the optimizer and learning rate scheduler.
Returns:
Tuple[List[torch.optim.AdamW], List[Dict]]: A tuple containing a list of the optimizer(s)
and a list of learning rate scheduler configurations.
"""
optimizer = torch.optim.AdamW(
self.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay
)
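
The scheduler setup is collapsed; given the return annotation, the method plausibly finishes along these lines (a sketch assuming `transformers`' linear warmup schedule, not the committed code):

```python
from transformers import get_linear_schedule_with_warmup

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=self.num_training_steps,
)
return [optimizer], [{"scheduler": scheduler, "interval": "step"}]
```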
