Commit cee7fa0
Showing 16 changed files with 377 additions and 0 deletions.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,8 @@
from flask import Flask
from routes import main_bp

app = Flask(__name__)
app.register_blueprint(main_bp)

if __name__ == "__main__":
    app.run(debug=True)
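The app above only wires the blueprint together; a minimal sketch of exercising it without a browser, using Flask's built-in test client, is below. It assumes the file is saved as app.py (as the `from routes import main_bp` layout suggests) and that importing it also loads the model behind the /classify route, which can take a while on first run.

# Sketch only (not a file in this commit): drive the app with Flask's test client.
# Assumes the application above lives in app.py and that routes.py is importable.
from app import app

with app.test_client() as client:
    # Render the input form
    print(client.get("/").status_code)  # expected: 200

    # Submit a DNA sequence to the /classify route defined in routes.py
    resp = client.post("/classify", data={"sequence": "ATTCCGATTCCGATTCCG"})
    print(resp.status_code)  # expected: 200 once the model and classifier have loaded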
@@ -0,0 +1,108 @@
import numpy as np
from datasets import load_from_disk
from transformers import AutoTokenizer, AutoModelForMaskedLM
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import torch

# Load the tokenizer and the quantized model
tokenizer = AutoTokenizer.from_pretrained("InstaDeepAI/nucleotide-transformer-500m-human-ref")
quantized_model = torch.quantization.quantize_dynamic(
    AutoModelForMaskedLM.from_pretrained("InstaDeepAI/nucleotide-transformer-500m-human-ref"),
    {torch.nn.Linear},
    dtype=torch.qint8
)

# Load the dataset
dataset_path = 'data/modified_dataset'
dataset = load_from_disk(dataset_path)

# Access the 'train' and 'test' splits from the dataset
train_sequences = dataset["train"]["sequence"]
train_labels = dataset["train"]["label"]
test_sequences = dataset["test"]["sequence"]
test_labels = dataset["test"]["label"]

# Determine the number of samples for 0.6% of the data
train_samples = int(len(train_sequences) * 0.006)
test_samples = int(len(test_sequences) * 0.006)

# Randomly select 0.6% of the data for the train and test splits
train_indices = np.random.choice(len(train_sequences), train_samples, replace=False)
test_indices = np.random.choice(len(test_sequences), test_samples, replace=False)

# Select the samples using the indices
train_sequences = [train_sequences[i] for i in train_indices]
train_labels = [train_labels[i] for i in train_indices]

test_sequences = [test_sequences[i] for i in test_indices]
test_labels = [test_labels[i] for i in test_indices]

def get_embeddings(sequences, batch_size, tokenizer, quantized_model):
    max_length = tokenizer.model_max_length
    num_sequences = len(sequences)
    num_batches = (num_sequences + batch_size - 1) // batch_size

    all_embeddings = []
    for i in range(num_batches):
        start_idx = i * batch_size
        end_idx = min((i + 1) * batch_size, num_sequences)
        batch_sequences = sequences[start_idx:end_idx]

        tokens_ids = tokenizer.batch_encode_plus(
            batch_sequences,
            return_tensors="pt",
            padding="max_length",
            max_length=max_length
        )["input_ids"]

        # Embedding generation using the quantized model
        attention_mask = tokens_ids != tokenizer.pad_token_id
        torch_outs = quantized_model(
            tokens_ids,
            attention_mask=attention_mask,
            encoder_attention_mask=attention_mask,
            output_hidden_states=True
        )
        embeddings = torch_outs['hidden_states'][-1].detach().numpy()

        # Mean sequence embedding calculation (masked mean over the token axis)
        attention_mask = torch.unsqueeze(attention_mask, dim=-1)
        mean_sequence_embeddings = torch.sum(attention_mask * embeddings, axis=-2) / torch.sum(attention_mask, axis=1)

        all_embeddings.append(mean_sequence_embeddings)

    # Concatenate embeddings from all batches
    mean_sequence_embeddings = np.concatenate(all_embeddings, axis=0)

    return mean_sequence_embeddings

# Define batch size
batch_size = 6

# Tokenization and embedding generation for the train and test sequences
train_embeddings = get_embeddings(train_sequences, batch_size, tokenizer, quantized_model)
test_embeddings = get_embeddings(test_sequences, batch_size, tokenizer, quantized_model)

# Use the mean embeddings as features and the sampled labels as targets
X_train, y_train = train_embeddings, train_labels
X_test, y_test = test_embeddings, test_labels

# Training the classifier
classifier = LogisticRegression(random_state=42)
classifier.fit(X_train, y_train)

# Predicting on the evaluation set
y_pred = classifier.predict(X_test)

# Evaluating the classifier
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

def classify_sequence(sequence):
    embedding = get_embeddings([sequence], 1, tokenizer, quantized_model)
    prediction = classifier.predict(embedding)
    return prediction[0], accuracy

def get_model_accuracy():
    return accuracy, report
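The two functions at the bottom are the entry points the Flask routes import; a minimal sketch of calling them directly, assuming this module is saved as controller.py (as `from controller import ...` in routes.py implies), would look like:

# Sketch only (not a file in this commit): calling the controller without Flask.
# Importing controller downloads the model, embeds the sampled data and fits the classifier.
from controller import classify_sequence, get_model_accuracy

label, acc = classify_sequence("ATTCCGATTCCGATTCCG")
print("Predicted label:", label)

accuracy, report = get_model_accuracy()
print("Held-out accuracy:", accuracy)
print(report)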
@@ -0,0 +1 @@
{"splits": ["train", "test"]}
Binary file not shown.
@@ -0,0 +1,16 @@
{
  "citation": "",
  "description": "",
  "features": {
    "sequence": {
      "dtype": "string",
      "_type": "Value"
    },
    "label": {
      "dtype": "int32",
      "_type": "Value"
    }
  },
  "homepage": "",
  "license": ""
}
@@ -0,0 +1,13 @@
{
  "_data_files": [
    {
      "filename": "data-00000-of-00001.arrow"
    }
  ],
  "_fingerprint": "b22e591ebc6cccfe",
  "_format_columns": null,
  "_format_kwargs": {},
  "_format_type": null,
  "_output_all_columns": false,
  "_split": null
}
Binary file not shown.
@@ -0,0 +1,16 @@
{
  "citation": "",
  "description": "",
  "features": {
    "sequence": {
      "dtype": "string",
      "_type": "Value"
    },
    "label": {
      "dtype": "int32",
      "_type": "Value"
    }
  },
  "homepage": "",
  "license": ""
}
@@ -0,0 +1,13 @@
{
  "_data_files": [
    {
      "filename": "data-00000-of-00001.arrow"
    }
  ],
  "_fingerprint": "52c117b7141708a3",
  "_format_columns": null,
  "_format_kwargs": {},
  "_format_type": null,
  "_output_all_columns": false,
  "_split": null
}
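The metadata above (dataset_dict.json plus one dataset_info.json and state.json per split, alongside the binary .arrow files) is the layout the Hugging Face datasets library writes via save_to_disk. A minimal sketch of producing an equivalent structure follows; the sequences and labels in it are placeholders, not the repository's data:

# Sketch only (not a file in this commit): building a dataset with the same on-disk layout.
# The example sequences and labels below are placeholders.
from datasets import Dataset, DatasetDict, Features, Value

features = Features({"sequence": Value("string"), "label": Value("int32")})
splits = DatasetDict({
    "train": Dataset.from_dict({"sequence": ["ATTCCG", "GGGTCA"], "label": [0, 1]}, features=features),
    "test": Dataset.from_dict({"sequence": ["TTTACG"], "label": [0]}, features=features),
})
# Writes dataset_dict.json plus per-split dataset_info.json, state.json and data-*.arrow files
splits.save_to_disk("data/modified_dataset")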
@@ -0,0 +1,111 @@
import numpy as np
from datasets import load_from_disk
from transformers import AutoTokenizer, AutoModelForMaskedLM
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import torch

# Define the get_embeddings function
def get_embeddings(sequences, batch_size, tokenizer, quantized_model):
    max_length = tokenizer.model_max_length
    num_sequences = len(sequences)
    num_batches = (num_sequences + batch_size - 1) // batch_size

    all_embeddings = []
    for i in range(num_batches):
        start_idx = i * batch_size
        end_idx = min((i + 1) * batch_size, num_sequences)
        batch_sequences = sequences[start_idx:end_idx]

        tokens_ids = tokenizer.batch_encode_plus(
            batch_sequences,
            return_tensors="pt",
            padding="max_length",
            max_length=max_length
        )["input_ids"]

        # Embedding generation using the quantized model
        attention_mask = tokens_ids != tokenizer.pad_token_id
        torch_outs = quantized_model(
            tokens_ids,
            attention_mask=attention_mask,
            encoder_attention_mask=attention_mask,
            output_hidden_states=True
        )
        embeddings = torch_outs['hidden_states'][-1].detach().numpy()

        # Mean sequence embedding calculation (masked mean over the token axis)
        attention_mask = torch.unsqueeze(attention_mask, dim=-1)
        mean_sequence_embeddings = torch.sum(attention_mask * embeddings, axis=-2) / torch.sum(attention_mask, axis=1)

        all_embeddings.append(mean_sequence_embeddings)

    # Concatenate embeddings from all batches
    mean_sequence_embeddings = np.concatenate(all_embeddings, axis=0)

    return mean_sequence_embeddings

# Load the tokenizer and the quantized model
tokenizer = AutoTokenizer.from_pretrained("InstaDeepAI/nucleotide-transformer-500m-human-ref")
quantized_model = torch.quantization.quantize_dynamic(
    AutoModelForMaskedLM.from_pretrained("InstaDeepAI/nucleotide-transformer-500m-human-ref"),
    {torch.nn.Linear},
    dtype=torch.qint8
)

# Load your dataset
dataset_path = r"C:\Users\USER\Downloads\InstaDeep Models\modified_dataset"
dataset = load_from_disk(dataset_path)

# Access the 'train' and 'test' splits from the dataset
train_sequences = dataset["train"]["sequence"]
train_labels = dataset["train"]["label"]

test_sequences = dataset["test"]["sequence"]
test_labels = dataset["test"]["label"]

# Determine the number of samples for 0.6% of the data
train_samples = int(len(train_sequences) * 0.006)
test_samples = int(len(test_sequences) * 0.006)

# Randomly select 0.6% of the data for the train and test splits
train_indices = np.random.choice(len(train_sequences), train_samples, replace=False)
test_indices = np.random.choice(len(test_sequences), test_samples, replace=False)

# Select the samples using the indices
train_sequences = [train_sequences[i] for i in train_indices]
train_labels = [train_labels[i] for i in train_indices]

test_sequences = [test_sequences[i] for i in test_indices]
test_labels = [test_labels[i] for i in test_indices]

# Display the number of rows in the train and test splits
print("Number of rows in train split:", len(train_sequences))
print("Number of rows in test split:", len(test_sequences))

# print(train_sequences[:5])
# Define batch size
batch_size = 6

# Tokenization and embedding generation for the train sequences
train_embeddings = get_embeddings(train_sequences, batch_size, tokenizer, quantized_model)

# Tokenization and embedding generation for the test sequences
test_embeddings = get_embeddings(test_sequences, batch_size, tokenizer, quantized_model)

# Use the mean embeddings as features and the sampled labels as targets
X_train, y_train = train_embeddings, train_labels
X_test, y_test = test_embeddings, test_labels

# Training the classifier
classifier = LogisticRegression(random_state=42)
classifier.fit(X_train, y_train)

# Predicting on the evaluation set
y_pred = classifier.predict(X_test)

print(X_test, y_pred)
# Evaluating the classifier
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
@@ -0,0 +1,14 @@
from flask import Blueprint, render_template, request
from controller import classify_sequence, get_model_accuracy

main_bp = Blueprint('main', __name__)

@main_bp.route('/')
def index():
    return render_template('index.html')

@main_bp.route('/classify', methods=['POST'])
def classify():
    sequence = request.form['sequence']
    prediction, accuracy = classify_sequence(sequence)
    return render_template('result.html', sequence=sequence, prediction=prediction, accuracy=accuracy)
@@ -0,0 +1,43 @@
body {
    font-family: Arial, sans-serif;
    margin: 40px;
}

h1 {
    color: #333;
}

form {
    margin-top: 20px;
}

label {
    font-weight: bold;
}

textarea {
    width: 100%;
    font-family: monospace;
}

input[type="submit"] {
    margin-top: 20px;
    padding: 10px 20px;
    font-size: 16px;
}

p {
    font-size: 18px;
}

a {
    margin-top: 20px;
    display: inline-block;
    font-size: 16px;
    color: #0066cc;
    text-decoration: none;
}

a:hover {
    text-decoration: underline;
}
@@ -0,0 +1,17 @@
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>DNA Sequence Classification</title>
    <link rel="stylesheet" href="/static/styles.css">
</head>
<body>
    <h1>DNA Sequence Classification</h1>
    <form action="/classify" method="post">
        <label for="sequence">DNA Sequence:</label><br>
        <textarea id="sequence" name="sequence" rows="4" cols="50"></textarea><br><br>
        <input type="submit" value="Classify">
    </form>
</body>
</html>
@@ -0,0 +1,17 @@
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Classification Result</title>
    <link rel="stylesheet" href="/static/styles.css">
</head>
<body>
    <h1>Classification Result</h1>
    <p><strong>Input Sequence:</strong></p>
    <p>{{ sequence }}</p>
    <p><strong>Prediction:</strong> {{ prediction }}</p>
    <p><strong>Model Accuracy:</strong> {{ accuracy }}</p>
    <a href="/">Go back</a>
</body>
</html>