Commit cee7fa0: Initial commit
Firas-HadjKacem committed May 31, 2024 (0 parents)
Showing 16 changed files with 377 additions and 0 deletions.
Binary file added __pycache__/controller.cpython-312.pyc
Binary file added __pycache__/routes.cpython-312.pyc
8 changes: 8 additions & 0 deletions app.py
@@ -0,0 +1,8 @@
from flask import Flask
from routes import main_bp

app = Flask(__name__)
app.register_blueprint(main_bp)

if __name__ == "__main__":
    app.run(debug=True)
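
One way to smoke-test these routes without a browser is Flask's built-in test client. The sketch below is illustrative and not part of the commit; note that importing app also imports controller, which trains the classifier at import time, so the first call is slow.

    from app import app

    with app.test_client() as client:
        assert client.get("/").status_code == 200
        resp = client.post("/classify", data={"sequence": "ATTCCGATTCCG"})
        print(resp.status_code)  # expect 200 with the rendered result page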
108 changes: 108 additions & 0 deletions controller.py
@@ -0,0 +1,108 @@
import numpy as np
from datasets import load_from_disk
from transformers import AutoTokenizer, AutoModelForMaskedLM
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import torch

# Load the tokenizer and the quantized model
tokenizer = AutoTokenizer.from_pretrained("InstaDeepAI/nucleotide-transformer-500m-human-ref")
quantized_model = torch.quantization.quantize_dynamic(
AutoModelForMaskedLM.from_pretrained("InstaDeepAI/nucleotide-transformer-500m-human-ref"),
{torch.nn.Linear},
dtype=torch.qint8
)
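
# Hedged note (not in the original commit): quantize_dynamic swaps the fp32
# weights of every nn.Linear for int8, which shrinks this ~500M-parameter model
# considerably and speeds up CPU inference, usually at a small accuracy cost.
# An optional, illustrative way to see the effect on serialized size:
#
#     import io
#     buf = io.BytesIO()
#     torch.save(quantized_model.state_dict(), buf)
#     print(f"quantized state_dict: {buf.getbuffer().nbytes / 1e6:.0f} MB")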

# Load the dataset
dataset_path = 'data/modified_dataset'
dataset = load_from_disk(dataset_path)

# Access the 'train' and 'test' splits from the dataset
train_sequences = dataset["train"]["sequence"]
train_labels = dataset["train"]["label"]
test_sequences = dataset["test"]["sequence"]
test_labels = dataset["test"]["label"]

# Determine the number of samples for 0.6% of the data
train_samples = int(len(train_sequences) * 0.006)
test_samples = int(len(test_sequences) * 0.006)

# Randomly select 0.6% of the data for train and test splits
train_indices = np.random.choice(len(train_sequences), train_samples, replace=False)
test_indices = np.random.choice(len(test_sequences), test_samples, replace=False)

# Select the samples using the indices
train_sequences = [train_sequences[i] for i in train_indices]
train_labels = [train_labels[i] for i in train_indices]

test_sequences = [test_sequences[i] for i in test_indices]
test_labels = [test_labels[i] for i in test_indices]

def get_embeddings(sequences, batch_size, tokenizer, quantized_model):
    max_length = tokenizer.model_max_length
    num_sequences = len(sequences)
    num_batches = (num_sequences + batch_size - 1) // batch_size

    all_embeddings = []
    for i in range(num_batches):
        start_idx = i * batch_size
        end_idx = min((i + 1) * batch_size, num_sequences)
        batch_sequences = sequences[start_idx:end_idx]

        tokens_ids = tokenizer.batch_encode_plus(
            batch_sequences,
            return_tensors="pt",
            padding="max_length",
            truncation=True,  # guard against sequences longer than the model limit
            max_length=max_length
        )["input_ids"]

        # Embedding generation using the quantized model (no gradients needed)
        attention_mask = tokens_ids != tokenizer.pad_token_id
        with torch.no_grad():
            torch_outs = quantized_model(
                tokens_ids,
                attention_mask=attention_mask,
                encoder_attention_mask=attention_mask,
                output_hidden_states=True
            )
        # Keep the last hidden state as a torch tensor until pooling is done
        embeddings = torch_outs['hidden_states'][-1].detach()

        # Masked mean over the token dimension: one fixed-size vector per sequence
        attention_mask = torch.unsqueeze(attention_mask, dim=-1)
        mean_sequence_embeddings = torch.sum(attention_mask * embeddings, axis=-2) / torch.sum(attention_mask, axis=1)

        all_embeddings.append(mean_sequence_embeddings.numpy())

    # Concatenate embeddings from all batches
    mean_sequence_embeddings = np.concatenate(all_embeddings, axis=0)

    return mean_sequence_embeddings
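
# Illustrative usage (not in the original commit): a single sequence comes back
# as one pooled row,
#
#     emb = get_embeddings(["ATTCCGATTCCG"], 1, tokenizer, quantized_model)
#     print(emb.shape)  # (1, hidden_size)
#
# so downstream scikit-learn code can treat each sequence as a fixed-size
# feature vector.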

# Define batch size
batch_size = 6

# Tokenization and Embedding generation for train and test sequences
train_embeddings = get_embeddings(train_sequences, batch_size, tokenizer, quantized_model)
test_embeddings = get_embeddings(test_sequences, batch_size, tokenizer, quantized_model)

# Pair the embedding matrices with their labels (the embeddings are already numpy arrays)
X_train, y_train = train_embeddings, train_labels
X_test, y_test = test_embeddings, test_labels

# Training the classifier
classifier = LogisticRegression(random_state=42)
classifier.fit(X_train, y_train)
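
# Hedged note (not in the original commit): with high-dimensional embedding
# features, LogisticRegression can stop at scikit-learn's default max_iter=100
# and emit a ConvergenceWarning; a larger iteration budget is the usual fix:
#
#     classifier = LogisticRegression(random_state=42, max_iter=1000)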

# Predicting on the evaluation set
y_pred = classifier.predict(X_test)

# Evaluating the classifier
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

def classify_sequence(sequence):
    embedding = get_embeddings([sequence], 1, tokenizer, quantized_model)
    prediction = classifier.predict(embedding)
    return prediction[0], accuracy

def get_model_accuracy():
    return accuracy, report
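
Because this module trains at import time, every process that imports controller (each Flask worker, and each reload in debug mode) pays the full embedding-plus-training cost again. A hedged sketch of one common remedy, persisting the fitted classifier with joblib (the file name is hypothetical):

    from joblib import dump, load

    dump(classifier, "classifier.joblib")   # once, after fitting
    classifier = load("classifier.joblib")  # on subsequent startups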
1 change: 1 addition & 0 deletions data/modified_dataset/dataset_dict.json
@@ -0,0 +1 @@
{"splits": ["train", "test"]}
16 changes: 16 additions & 0 deletions data/modified_dataset/test/dataset_info.json
@@ -0,0 +1,16 @@
{
  "citation": "",
  "description": "",
  "features": {
    "sequence": {
      "dtype": "string",
      "_type": "Value"
    },
    "label": {
      "dtype": "int32",
      "_type": "Value"
    }
  },
  "homepage": "",
  "license": ""
}
13 changes: 13 additions & 0 deletions data/modified_dataset/test/state.json
@@ -0,0 +1,13 @@
{
  "_data_files": [
    {
      "filename": "data-00000-of-00001.arrow"
    }
  ],
  "_fingerprint": "b22e591ebc6cccfe",
  "_format_columns": null,
  "_format_kwargs": {},
  "_format_type": null,
  "_output_all_columns": false,
  "_split": null
}
16 changes: 16 additions & 0 deletions data/modified_dataset/train/dataset_info.json
@@ -0,0 +1,16 @@
{
  "citation": "",
  "description": "",
  "features": {
    "sequence": {
      "dtype": "string",
      "_type": "Value"
    },
    "label": {
      "dtype": "int32",
      "_type": "Value"
    }
  },
  "homepage": "",
  "license": ""
}
13 changes: 13 additions & 0 deletions data/modified_dataset/train/state.json
@@ -0,0 +1,13 @@
{
  "_data_files": [
    {
      "filename": "data-00000-of-00001.arrow"
    }
  ],
  "_fingerprint": "52c117b7141708a3",
  "_format_columns": null,
  "_format_kwargs": {},
  "_format_type": null,
  "_output_all_columns": false,
  "_split": null
}
111 changes: 111 additions & 0 deletions non_routed_version.py
@@ -0,0 +1,111 @@
import numpy as np
from datasets import load_from_disk
from transformers import AutoTokenizer, AutoModelForMaskedLM
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import torch

# Define the get_embeddings function
def get_embeddings(sequences, batch_size, tokenizer, quantized_model):
    max_length = tokenizer.model_max_length
    num_sequences = len(sequences)
    num_batches = (num_sequences + batch_size - 1) // batch_size

    all_embeddings = []
    for i in range(num_batches):
        start_idx = i * batch_size
        end_idx = min((i + 1) * batch_size, num_sequences)
        batch_sequences = sequences[start_idx:end_idx]

        tokens_ids = tokenizer.batch_encode_plus(
            batch_sequences,
            return_tensors="pt",
            padding="max_length",
            truncation=True,  # guard against sequences longer than the model limit
            max_length=max_length
        )["input_ids"]

        # Embedding generation using the quantized model (no gradients needed)
        attention_mask = tokens_ids != tokenizer.pad_token_id
        with torch.no_grad():
            torch_outs = quantized_model(
                tokens_ids,
                attention_mask=attention_mask,
                encoder_attention_mask=attention_mask,
                output_hidden_states=True
            )
        # Keep the last hidden state as a torch tensor until pooling is done
        embeddings = torch_outs['hidden_states'][-1].detach()

        # Masked mean over the token dimension: one fixed-size vector per sequence
        attention_mask = torch.unsqueeze(attention_mask, dim=-1)
        mean_sequence_embeddings = torch.sum(attention_mask * embeddings, axis=-2) / torch.sum(attention_mask, axis=1)

        all_embeddings.append(mean_sequence_embeddings.numpy())

    # Concatenate embeddings from all batches
    mean_sequence_embeddings = np.concatenate(all_embeddings, axis=0)

    return mean_sequence_embeddings

# Load the tokenizer and the quantized model
tokenizer = AutoTokenizer.from_pretrained("InstaDeepAI/nucleotide-transformer-500m-human-ref")
quantized_model = torch.quantization.quantize_dynamic(
AutoModelForMaskedLM.from_pretrained("InstaDeepAI/nucleotide-transformer-500m-human-ref"),
{torch.nn.Linear},
dtype=torch.qint8
)

# Load your dataset
dataset_path = r"C:\Users\USER\Downloads\InstaDeep Models\modified_dataset"
dataset = load_from_disk(dataset_path)

# Access the 'train' and 'test' splits from the dataset
train_sequences = dataset["train"]["sequence"]
train_labels = dataset["train"]["label"]

test_sequences = dataset["test"]["sequence"]
test_labels = dataset["test"]["label"]

# Determine the number of samples for 0.6% of the data
train_samples = int(len(train_sequences) * 0.006)
test_samples = int(len(test_sequences) * 0.006)

# Randomly select 0.6% of the data for train and test splits
train_indices = np.random.choice(len(train_sequences), train_samples, replace=False)
test_indices = np.random.choice(len(test_sequences), test_samples, replace=False)

# Select the samples using the indices
train_sequences = [train_sequences[i] for i in train_indices]
train_labels = [train_labels[i] for i in train_indices]

test_sequences = [test_sequences[i] for i in test_indices]
test_labels = [test_labels[i] for i in test_indices]

# Display the number of rows in the train and test splits
print("Number of rows in train split:", len(train_sequences))
print("Number of rows in test split:", len(test_sequences))

# Define batch size
batch_size = 6

# Tokenization and Embedding generation for train sequences
train_embeddings = get_embeddings(train_sequences, batch_size, tokenizer, quantized_model)

# Tokenization and Embedding generation for test sequences
test_embeddings = get_embeddings(test_sequences, batch_size, tokenizer, quantized_model)

# Pair the embedding matrices with their labels (the embeddings are already numpy arrays)
X_train, y_train = train_embeddings, train_labels
X_test, y_test = test_embeddings, test_labels

# Training the classifier
classifier = LogisticRegression(random_state=42)
classifier.fit(X_train, y_train)

# Predicting on the evaluation set
y_pred = classifier.predict(X_test)

print("Predictions:", y_pred)
# Evaluating the classifier
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
14 changes: 14 additions & 0 deletions routes.py
@@ -0,0 +1,14 @@
from flask import Blueprint, render_template, request
from controller import classify_sequence, get_model_accuracy

main_bp = Blueprint('main', __name__)

@main_bp.route('/')
def index():
    return render_template('index.html')

@main_bp.route('/classify', methods=['POST'])
def classify():
    sequence = request.form['sequence']
    prediction, accuracy = classify_sequence(sequence)
    return render_template('result.html', sequence=sequence, prediction=prediction, accuracy=accuracy)
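
The /classify handler passes whatever the form submits straight into the model, and request.form['sequence'] raises a KeyError if the field is missing. A small input guard is a common hardening step; this is a hypothetical variant of the route, not part of the commit:

    import re

    NUCLEOTIDES = re.compile(r"^[ACGTNacgtn]+$")

    @main_bp.route('/classify', methods=['POST'])
    def classify():
        sequence = request.form.get('sequence', '').strip()
        if not NUCLEOTIDES.fullmatch(sequence):
            return render_template('index.html'), 400  # reject empty or non-nucleotide input
        prediction, accuracy = classify_sequence(sequence)
        return render_template('result.html', sequence=sequence, prediction=prediction, accuracy=accuracy)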
43 changes: 43 additions & 0 deletions static/styles.css
@@ -0,0 +1,43 @@
body {
    font-family: Arial, sans-serif;
    margin: 40px;
}

h1 {
    color: #333;
}

form {
    margin-top: 20px;
}

label {
    font-weight: bold;
}

textarea {
    width: 100%;
    font-family: monospace;
}

input[type="submit"] {
    margin-top: 20px;
    padding: 10px 20px;
    font-size: 16px;
}

p {
    font-size: 18px;
}

a {
    margin-top: 20px;
    display: inline-block;
    font-size: 16px;
    color: #0066cc;
    text-decoration: none;
}

a:hover {
    text-decoration: underline;
}
17 changes: 17 additions & 0 deletions templates/index.html
@@ -0,0 +1,17 @@
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>DNA Sequence Classification</title>
    <link rel="stylesheet" href="/static/styles.css">
</head>
<body>
    <h1>DNA Sequence Classification</h1>
    <form action="/classify" method="post">
        <label for="sequence">DNA Sequence:</label><br>
        <textarea id="sequence" name="sequence" rows="4" cols="50"></textarea><br><br>
        <input type="submit" value="Classify">
    </form>
</body>
</html>
17 changes: 17 additions & 0 deletions templates/result.html
@@ -0,0 +1,17 @@
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Classification Result</title>
    <link rel="stylesheet" href="/static/styles.css">
</head>
<body>
    <h1>Classification Result</h1>
    <p><strong>Input Sequence:</strong></p>
    <p>{{ sequence }}</p>
    <p><strong>Prediction:</strong> {{ prediction }}</p>
    <p><strong>Model Accuracy:</strong> {{ accuracy }}</p>
    <a href="/">Go back</a>
</body>
</html>
