Commit cee7fa0
Showing 16 changed files with 377 additions and 0 deletions.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,8 @@
from flask import Flask
from routes import main_bp

app = Flask(__name__)
app.register_blueprint(main_bp)

if __name__ == "__main__":
    app.run(debug=True)
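The app above only wires the blueprint together; a minimal sketch of exercising it without a browser, using Flask's built-in test client, is below. It assumes the file is saved as app.py (as the `from routes import main_bp` layout suggests) and that importing it also loads the model behind the /classify route, which can take a while on first run.

# Sketch only (not a file in this commit): drive the app with Flask's test client.
# Assumes the application above lives in app.py and that routes.py is importable.
from app import app

with app.test_client() as client:
    # Render the input form
    print(client.get("/").status_code)  # expected: 200

    # Submit a DNA sequence to the /classify route defined in routes.py
    resp = client.post("/classify", data={"sequence": "ATTCCGATTCCGATTCCG"})
    print(resp.status_code)  # expected: 200 once the model and classifier have loaded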
@@ -0,0 +1,108 @@
import numpy as np
from datasets import load_from_disk
from transformers import AutoTokenizer, AutoModelForMaskedLM
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import torch

# Load the tokenizer and the quantized model
tokenizer = AutoTokenizer.from_pretrained("InstaDeepAI/nucleotide-transformer-500m-human-ref")
quantized_model = torch.quantization.quantize_dynamic(
    AutoModelForMaskedLM.from_pretrained("InstaDeepAI/nucleotide-transformer-500m-human-ref"),
    {torch.nn.Linear},
    dtype=torch.qint8
)

# Load the dataset
dataset_path = 'data/modified_dataset'
dataset = load_from_disk(dataset_path)

# Access the 'train' and 'test' splits from the dataset
train_sequences = dataset["train"]["sequence"]
train_labels = dataset["train"]["label"]
test_sequences = dataset["test"]["sequence"]
test_labels = dataset["test"]["label"]

# Determine the number of samples for 0.6% of the data
train_samples = int(len(train_sequences) * 0.006)
test_samples = int(len(test_sequences) * 0.006)

# Randomly select 0.6% of the data for the train and test splits
train_indices = np.random.choice(len(train_sequences), train_samples, replace=False)
test_indices = np.random.choice(len(test_sequences), test_samples, replace=False)

# Select the samples using the indices
train_sequences = [train_sequences[i] for i in train_indices]
train_labels = [train_labels[i] for i in train_indices]

test_sequences = [test_sequences[i] for i in test_indices]
test_labels = [test_labels[i] for i in test_indices]

def get_embeddings(sequences, batch_size, tokenizer, quantized_model):
    max_length = tokenizer.model_max_length
    num_sequences = len(sequences)
    num_batches = (num_sequences + batch_size - 1) // batch_size

    all_embeddings = []
    for i in range(num_batches):
        start_idx = i * batch_size
        end_idx = min((i + 1) * batch_size, num_sequences)
        batch_sequences = sequences[start_idx:end_idx]

        tokens_ids = tokenizer.batch_encode_plus(
            batch_sequences,
            return_tensors="pt",
            padding="max_length",
            max_length=max_length
        )["input_ids"]

        # Embedding generation using the quantized model
        attention_mask = tokens_ids != tokenizer.pad_token_id
        torch_outs = quantized_model(
            tokens_ids,
            attention_mask=attention_mask,
            encoder_attention_mask=attention_mask,
            output_hidden_states=True
        )
        embeddings = torch_outs['hidden_states'][-1].detach().numpy()

        # Mean sequence embedding calculation (masked mean over the token axis)
        attention_mask = torch.unsqueeze(attention_mask, dim=-1)
        mean_sequence_embeddings = torch.sum(attention_mask * embeddings, axis=-2) / torch.sum(attention_mask, axis=1)

        all_embeddings.append(mean_sequence_embeddings)

    # Concatenate embeddings from all batches
    mean_sequence_embeddings = np.concatenate(all_embeddings, axis=0)

    return mean_sequence_embeddings

# Define batch size
batch_size = 6

# Tokenization and embedding generation for the train and test sequences
train_embeddings = get_embeddings(train_sequences, batch_size, tokenizer, quantized_model)
test_embeddings = get_embeddings(test_sequences, batch_size, tokenizer, quantized_model)

# Use the mean embeddings as features and the sampled labels as targets
X_train, y_train = train_embeddings, train_labels
X_test, y_test = test_embeddings, test_labels

# Training the classifier
classifier = LogisticRegression(random_state=42)
classifier.fit(X_train, y_train)

# Predicting on the evaluation set
y_pred = classifier.predict(X_test)

# Evaluating the classifier
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

def classify_sequence(sequence):
    embedding = get_embeddings([sequence], 1, tokenizer, quantized_model)
    prediction = classifier.predict(embedding)
    return prediction[0], accuracy

def get_model_accuracy():
    return accuracy, report
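The two functions at the bottom are the entry points the Flask routes import; a minimal sketch of calling them directly, assuming this module is saved as controller.py (as `from controller import ...` in routes.py implies), would look like:

# Sketch only (not a file in this commit): calling the controller without Flask.
# Importing controller downloads the model, embeds the sampled data and fits the classifier.
from controller import classify_sequence, get_model_accuracy

label, acc = classify_sequence("ATTCCGATTCCGATTCCG")
print("Predicted label:", label)

accuracy, report = get_model_accuracy()
print("Held-out accuracy:", accuracy)
print(report)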
@@ -0,0 +1 @@
{"splits": ["train", "test"]}
Binary file not shown.
@@ -0,0 +1,16 @@
{
  "citation": "",
  "description": "",
  "features": {
    "sequence": {
      "dtype": "string",
      "_type": "Value"
    },
    "label": {
      "dtype": "int32",
      "_type": "Value"
    }
  },
  "homepage": "",
  "license": ""
}
@@ -0,0 +1,13 @@
{
  "_data_files": [
    {
      "filename": "data-00000-of-00001.arrow"
    }
  ],
  "_fingerprint": "b22e591ebc6cccfe",
  "_format_columns": null,
  "_format_kwargs": {},
  "_format_type": null,
  "_output_all_columns": false,
  "_split": null
}
Binary file not shown.
@@ -0,0 +1,16 @@
{
  "citation": "",
  "description": "",
  "features": {
    "sequence": {
      "dtype": "string",
      "_type": "Value"
    },
    "label": {
      "dtype": "int32",
      "_type": "Value"
    }
  },
  "homepage": "",
  "license": ""
}
@@ -0,0 +1,13 @@
{
  "_data_files": [
    {
      "filename": "data-00000-of-00001.arrow"
    }
  ],
  "_fingerprint": "52c117b7141708a3",
  "_format_columns": null,
  "_format_kwargs": {},
  "_format_type": null,
  "_output_all_columns": false,
  "_split": null
}
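The metadata above (dataset_dict.json plus one dataset_info.json and state.json per split, alongside the binary .arrow files) is the layout the Hugging Face datasets library writes via save_to_disk. A minimal sketch of producing an equivalent structure follows; the sequences and labels in it are placeholders, not the repository's data:

# Sketch only (not a file in this commit): building a dataset with the same on-disk layout.
# The example sequences and labels below are placeholders.
from datasets import Dataset, DatasetDict, Features, Value

features = Features({"sequence": Value("string"), "label": Value("int32")})
splits = DatasetDict({
    "train": Dataset.from_dict({"sequence": ["ATTCCG", "GGGTCA"], "label": [0, 1]}, features=features),
    "test": Dataset.from_dict({"sequence": ["TTTACG"], "label": [0]}, features=features),
})
# Writes dataset_dict.json plus per-split dataset_info.json, state.json and data-*.arrow files
splits.save_to_disk("data/modified_dataset")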
@@ -0,0 +1,111 @@
import numpy as np
from datasets import load_from_disk
from transformers import AutoTokenizer, AutoModelForMaskedLM
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import torch

# Define the get_embeddings function
def get_embeddings(sequences, batch_size, tokenizer, quantized_model):
    max_length = tokenizer.model_max_length
    num_sequences = len(sequences)
    num_batches = (num_sequences + batch_size - 1) // batch_size

    all_embeddings = []
    for i in range(num_batches):
        start_idx = i * batch_size
        end_idx = min((i + 1) * batch_size, num_sequences)
        batch_sequences = sequences[start_idx:end_idx]

        tokens_ids = tokenizer.batch_encode_plus(
            batch_sequences,
            return_tensors="pt",
            padding="max_length",
            max_length=max_length
        )["input_ids"]

        # Embedding generation using the quantized model
        attention_mask = tokens_ids != tokenizer.pad_token_id
        torch_outs = quantized_model(
            tokens_ids,
            attention_mask=attention_mask,
            encoder_attention_mask=attention_mask,
            output_hidden_states=True
        )
        embeddings = torch_outs['hidden_states'][-1].detach().numpy()

        # Mean sequence embedding calculation (masked mean over the token axis)
        attention_mask = torch.unsqueeze(attention_mask, dim=-1)
        mean_sequence_embeddings = torch.sum(attention_mask * embeddings, axis=-2) / torch.sum(attention_mask, axis=1)

        all_embeddings.append(mean_sequence_embeddings)

    # Concatenate embeddings from all batches
    mean_sequence_embeddings = np.concatenate(all_embeddings, axis=0)

    return mean_sequence_embeddings

# Load the tokenizer and the quantized model
tokenizer = AutoTokenizer.from_pretrained("InstaDeepAI/nucleotide-transformer-500m-human-ref")
quantized_model = torch.quantization.quantize_dynamic(
    AutoModelForMaskedLM.from_pretrained("InstaDeepAI/nucleotide-transformer-500m-human-ref"),
    {torch.nn.Linear},
    dtype=torch.qint8
)

# Load your dataset
dataset_path = r"C:\Users\USER\Downloads\InstaDeep Models\modified_dataset"
dataset = load_from_disk(dataset_path)

# Access the 'train' and 'test' splits from the dataset
train_sequences = dataset["train"]["sequence"]
train_labels = dataset["train"]["label"]

test_sequences = dataset["test"]["sequence"]
test_labels = dataset["test"]["label"]

# Determine the number of samples for 0.6% of the data
train_samples = int(len(train_sequences) * 0.006)
test_samples = int(len(test_sequences) * 0.006)

# Randomly select 0.6% of the data for the train and test splits
train_indices = np.random.choice(len(train_sequences), train_samples, replace=False)
test_indices = np.random.choice(len(test_sequences), test_samples, replace=False)

# Select the samples using the indices
train_sequences = [train_sequences[i] for i in train_indices]
train_labels = [train_labels[i] for i in train_indices]

test_sequences = [test_sequences[i] for i in test_indices]
test_labels = [test_labels[i] for i in test_indices]

# Display the number of rows in the train and test splits
print("Number of rows in train split:", len(train_sequences))
print("Number of rows in test split:", len(test_sequences))

# print(train_sequences[:5])
# Define batch size
batch_size = 6

# Tokenization and embedding generation for the train sequences
train_embeddings = get_embeddings(train_sequences, batch_size, tokenizer, quantized_model)

# Tokenization and embedding generation for the test sequences
test_embeddings = get_embeddings(test_sequences, batch_size, tokenizer, quantized_model)

# Use the mean embeddings as features and the sampled labels as targets
X_train, y_train = train_embeddings, train_labels
X_test, y_test = test_embeddings, test_labels

# Training the classifier
classifier = LogisticRegression(random_state=42)
classifier.fit(X_train, y_train)

# Predicting on the evaluation set
y_pred = classifier.predict(X_test)

print(X_test, y_pred)
# Evaluating the classifier
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
@@ -0,0 +1,14 @@
from flask import Blueprint, render_template, request
from controller import classify_sequence, get_model_accuracy

main_bp = Blueprint('main', __name__)

@main_bp.route('/')
def index():
    return render_template('index.html')

@main_bp.route('/classify', methods=['POST'])
def classify():
    sequence = request.form['sequence']
    prediction, accuracy = classify_sequence(sequence)
    return render_template('result.html', sequence=sequence, prediction=prediction, accuracy=accuracy)
@@ -0,0 +1,43 @@
body {
    font-family: Arial, sans-serif;
    margin: 40px;
}

h1 {
    color: #333;
}

form {
    margin-top: 20px;
}

label {
    font-weight: bold;
}

textarea {
    width: 100%;
    font-family: monospace;
}

input[type="submit"] {
    margin-top: 20px;
    padding: 10px 20px;
    font-size: 16px;
}

p {
    font-size: 18px;
}

a {
    margin-top: 20px;
    display: inline-block;
    font-size: 16px;
    color: #0066cc;
    text-decoration: none;
}

a:hover {
    text-decoration: underline;
}
@@ -0,0 +1,17 @@
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>DNA Sequence Classification</title>
    <link rel="stylesheet" href="/static/styles.css">
</head>
<body>
    <h1>DNA Sequence Classification</h1>
    <form action="/classify" method="post">
        <label for="sequence">DNA Sequence:</label><br>
        <textarea id="sequence" name="sequence" rows="4" cols="50"></textarea><br><br>
        <input type="submit" value="Classify">
    </form>
</body>
</html>
@@ -0,0 +1,17 @@
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Classification Result</title>
    <link rel="stylesheet" href="/static/styles.css">
</head>
<body>
    <h1>Classification Result</h1>
    <p><strong>Input Sequence:</strong></p>
    <p>{{ sequence }}</p>
    <p><strong>Prediction:</strong> {{ prediction }}</p>
    <p><strong>Model Accuracy:</strong> {{ accuracy }}</p>
    <a href="/">Go back</a>
</body>
</html>