OpenProteinAI · timt51 · Apr 25, 2024 · Apr 25, 2024
diff --git a/config.json b/config.json
diff --git a/proteingym/baselines/PoET/LICENSE b/proteingym/baselines/PoET/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2023 OpenProteinAI
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/proteingym/baselines/PoET/Makefile b/proteingym/baselines/PoET/Makefile
@@ -0,0 +1,30 @@
+# Task-runner Makefile for the PoET baseline: builds the conda environment and
+# downloads the model checkpoint, MSAs and precomputed scores.
+
+# Directory that downloaded artifacts are written to (override on the command
+# line, e.g. `make download DOWNLOAD_DIR=/scratch/poet`).
+DOWNLOAD_DIR := ./data
+
+# Base URL of the Zenodo record that hosts every downloadable artifact.
+ZENODO_URL := https://zenodo.org/records/11062361/files
+
+# Path of this Makefile, so recursive invocations work even with `make -f`.
+THIS_MAKEFILE := $(lastword $(MAKEFILE_LIST))
+
+# Every target here is a command, not a file; without this declaration a file
+# or directory named e.g. `download` would make the target look up to date.
+.PHONY: create_lock create_stable_env create_conda_env update_conda_env \
+        download_model download_and_extract_msas download_scores download
+
+# Regenerate the explicit linux-64 lock file from environment.yml.
+create_lock:
+	cd environments/poet && conda-lock --kind explicit -f environment.yml -p linux-64
+
+# Create the `poet` conda environment from the lock file, then add the
+# prebuilt flash-attention wheel and this package (editable install).
+create_stable_env:
+	conda create --name poet --file environments/poet/conda-linux-64.lock
+	conda run -n poet pip install https://github.com/Dao-AILab/flash-attention/releases/download/v2.1.1/flash_attn-2.1.1+cu118torch2.0cxx11abiFALSE-cp39-cp39-linux_x86_64.whl
+	conda run -n poet pip install -e .
+
+# Public alias used in the README.
+create_conda_env: create_stable_env
+
+# Re-lock, then rebuild the environment. The two steps run as sequential
+# sub-makes because the environment must be created from the *new* lock file;
+# listing them as prerequisites would let `make -j` run them concurrently.
+update_conda_env:
+	$(MAKE) -f $(THIS_MAKEFILE) create_lock
+	$(MAKE) -f $(THIS_MAKEFILE) create_conda_env
+
+# Download the model checkpoint and the license that covers the weights.
+# URLs are quoted because `?` is a shell glob character.
+download_model:
+	mkdir -p "$(DOWNLOAD_DIR)"
+	wget -c "$(ZENODO_URL)/poet.ckpt?download=1" -O "$(DOWNLOAD_DIR)/poet.ckpt"
+	wget -c "$(ZENODO_URL)/LICENSE?download=1" -O "$(DOWNLOAD_DIR)/LICENSE"
+
+# Download the MSA archive, unpack it in place and delete the archive.
+download_and_extract_msas:
+	mkdir -p "$(DOWNLOAD_DIR)"
+	wget -c "$(ZENODO_URL)/proteingymv1_colabfold2302_msas.tar?download=1" -O "$(DOWNLOAD_DIR)/proteingymv1_colabfold2302_msas.tar"
+	cd "$(DOWNLOAD_DIR)" && tar -xf proteingymv1_colabfold2302_msas.tar && rm proteingymv1_colabfold2302_msas.tar
+
+# Download the precomputed scores archive, unpack it in place and delete it.
+download_scores:
+	mkdir -p "$(DOWNLOAD_DIR)"
+	wget -c "$(ZENODO_URL)/proteingymv1_scores.tar.gz?download=1" -O "$(DOWNLOAD_DIR)/proteingymv1_scores.tar.gz"
+	cd "$(DOWNLOAD_DIR)" && tar -xzf proteingymv1_scores.tar.gz && rm proteingymv1_scores.tar.gz
+
+# Fetch everything: model, MSAs and scores.
+download: download_model download_and_extract_msas download_scores
diff --git a/proteingym/baselines/PoET/README.md b/proteingym/baselines/PoET/README.md
@@ -0,0 +1,33 @@
+# PoET: A generative model of protein families as sequences-of-sequences
+
+Contains the inference code for ["PoET: A generative model of protein families as sequences-of-sequences"](https://arxiv.org/abs/2306.06156), a state-of-the-art protein language model for variant effect prediction and conditional sequence generation.
+
+## Environment Setup
+
+1. Have `conda` installed
+1. Run `make create_conda_env`. This will create a conda environment named `poet`.
+1. Run `make download_model` to download the model (~400MB). The model will be located at `data/poet.ckpt`. Please note the [license](#license).
+
+## Citation
+
+You may cite the paper as
+
+```
+@inproceedings{NEURIPS2023_f4366126,
+ author = {Truong Jr, Timothy and Bepler, Tristan},
+ booktitle = {Advances in Neural Information Processing Systems},
+ editor = {A. Oh and T. Neumann and A. Globerson and K. Saenko and M. Hardt and S. Levine},
+ pages = {77379--77415},
+ publisher = {Curran Associates, Inc.},
+ title = {PoET: A generative model of protein families as sequences-of-sequences},
+ url = {https://proceedings.neurips.cc/paper_files/paper/2023/file/f4366126eba252699b280e8f93c0ab2f-Paper-Conference.pdf},
+ volume = {36},
+ year = {2023}
+}
+```
+
+## License
+
+This source code is licensed under the MIT license found in the LICENSE file in the root directory of this source tree.
+
+The [PoET model weights](https://zenodo.org/records/10061322) (DOI: `10.5281/zenodo.10061322`) are available under the [CC BY-NC-SA 4.0](http://creativecommons.org/licenses/by-nc-sa/4.0/) license for academic use only. The license can also be found in the LICENSE file provided with the model weights. For commercial use, please reach out to us at [email protected] about licensing. Copyright (c) NE47 Bio, Inc. All Rights Reserved.
diff --git a/proteingym/baselines/PoET/environments/poet/conda-linux-64.lock b/proteingym/baselines/PoET/environments/poet/conda-linux-64.lock
diff --git a/proteingym/baselines/PoET/environments/poet/environment.yml b/proteingym/baselines/PoET/environments/poet/environment.yml
@@ -0,0 +1,34 @@
+# Conda environment specification for PoET. `make create_lock` feeds this file
+# to conda-lock to produce the explicit linux-64 lock file in this directory.
+channels:
+  - pytorch
+  - nvidia
+  - bioconda
+  - conda-forge
+dependencies:
+  # python / pytorch / CUDA versions line up with the prebuilt flash-attn wheel
+  # installed by the Makefile (cp39, torch2.0, cu118); change them together.
+  - python=3.9
+  - pytorch~=2.0.1
+  - pytorch-cuda~=11.8.0
+  - mkl==2024.0.0
+  - lightning~=2.0.7
+  - torchmetrics~=1.0.3
+  - transformers~=4.33
+  - tensorboard
+  - numba
+  - numpy
+  - scipy
+  - pandas
+  - scikit-learn
+  - matplotlib
+  - seaborn
+  - tqdm
+  - pydantic<2
+  - pyarrow
+  - ipykernel
+  - zarr
+  - fsspec
+  - s3fs
+  - boto3
+  - mmseqs2
+  - hhsuite
+  - biopython
+  - pyzstd
+  # pip is needed for the `pip install` steps run by the Makefile.
+  - pip
diff --git a/proteingym/baselines/PoET/poet/__init__.py b/proteingym/baselines/PoET/poet/__init__.py
diff --git a/proteingym/baselines/PoET/poet/alphabets.py b/proteingym/baselines/PoET/poet/alphabets.py
@@ -0,0 +1,133 @@
+"""
+Copyright (C) Tristan Bepler - All Rights Reserved
+Author: Tristan Bepler <[email protected]>
+"""
+
+from __future__ import division, print_function
+
+import numpy as np
+
+
+class Alphabet:
+    def __init__(self, chars, encoding=None, mask=False, missing=255):
+        self.chars = np.frombuffer(chars, dtype=np.uint8)
+        self.encoding = np.zeros(256, dtype=np.uint8) + missing
+        if encoding is None:
+            self.encoding[self.chars] = np.arange(len(self.chars))
+            self.size = len(self.chars)
+        else:
+            self.encoding[self.chars] = encoding
+            self.size = encoding.max() + 1
+        self.mask = mask
+        if mask:
+            self.size -= 1
+
+    def __len__(self):
+        return self.size
+
+    def __getitem__(self, i):
+        return chr(self.chars[i])
+
+    def encode(self, x):
+        """encode a byte string into alphabet indices"""
+        x = np.frombuffer(x, dtype=np.uint8)
+        return self.encoding[x]
+
+    def decode(self, x):
+        """decode index array, x, to byte string of this alphabet"""
+        string = self.chars[x]
+        return string.tobytes()
+
+    def unpack(self, h, k):
+        """unpack integer h into array of this alphabet with length k"""
+        n = self.size
+        kmer = np.zeros(k, dtype=np.uint8)
+        for i in reversed(range(k)):
+            c = h % n
+            kmer[i] = c
+            h = h // n
+        return kmer
+
+    def get_kmer(self, h, k):
+        """retrieve byte string of length k decoded from integer h"""
+        kmer = self.unpack(h, k)
+        return self.decode(kmer)
+
+
+# Four-letter nucleotide alphabet: A=0, C=1, G=2, T=3; any other byte encodes to 255.
+DNA = Alphabet(b"ACGT")
+
+
+class Uniprot21(Alphabet):
+    def __init__(
+        self,
+        mask=False,
+        include_gap=False,
+        include_startstop=False,
+        distinct_startstop=False,
+    ):
+        chars = b"ARNDCQEGHILKMFPSTWYV"
+        gap_token = start_token = stop_token = -1
+        if include_gap:
+            chars = chars + b"-"
+            gap_token = len(chars) - 1
+        if include_startstop:
+            chars = chars + b"*"
+            start_token = stop_token = len(chars) - 1
+        if distinct_startstop:
+            chars = chars + b"$"
+            stop_token = len(chars) - 1
+        # add the synonym tokens
+        mask_token = len(chars)
+        chars = chars + b"XOUBZ"
+
+        encoding = np.arange(len(chars))
+        encoding[mask_token + 1 :] = [
+            11,
+            4,
+            mask_token,
+            mask_token,
+        ]  # encode 'OUBZ' as synonyms
+        missing = mask_token
+
+        super(Uniprot21, self).__init__(
+            chars, encoding=encoding, mask=mask, missing=missing
+        )
+
+        self.gap_token = gap_token
+        self.start_token = start_token
+        self.stop_token = stop_token
+        self.mask_token = mask_token
+
+
+class SDM12(Alphabet):
+    """
+    A D KER N TSQ YF LIVM C W H G P
+
+    See https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2732308/#B33
+    "Reduced amino acid alphabets exhibit an improved sensitivity and selectivity in fold assignment"
+    Peterson et al. 2009. Bioinformatics.
+    """
+
+    def __init__(self, mask=False):
+        chars = alphabet = b"ADKNTYLCWHGPXERSQFIVMOUBZ"
+        groups = [
+            b"A",
+            b"D",
+            b"KERO",
+            b"N",
+            b"TSQ",
+            b"YF",
+            b"LIVM",
+            b"CU",
+            b"W",
+            b"H",
+            b"G",
+            b"P",
+            b"XBZ",
+        ]
+        groups = {c: i for i in range(len(groups)) for c in groups[i]}
+        encoding = np.array([groups[c] for c in chars])
+        super(SDM12, self).__init__(chars, encoding=encoding, mask=mask)
+
+
+# Eight-symbol alphabet whose last symbol is a space character.
+# NOTE(review): presumably 8-state secondary-structure codes with ' ' as coil — confirm.
+SecStr8 = Alphabet(b"HBEGITS ")
diff --git a/proteingym/baselines/PoET/poet/fasta.py b/proteingym/baselines/PoET/poet/fasta.py
@@ -0,0 +1,51 @@
+"""
+Copyright (C) Tristan Bepler - All Rights Reserved
+Author: Tristan Bepler <[email protected]>
+"""
+
+from __future__ import division, print_function
+
+
+def parse_stream(f, comment=b"#", upper=True):
+    name = None
+    sequence = []
+    for line in f:
+        if line.startswith(comment):
+            continue
+        line = line.strip()
+        if line.startswith(b">"):
+            if name is not None:
+                yield name, b"".join(sequence)
+            name = line[1:]
+            sequence = []
+        else:
+            if upper:
+                sequence.append(line.upper())
+            else:
+                sequence.append(line)
+    if name is not None:
+        yield name, b"".join(sequence)
+
+
+def parse(f, comment=b"#"):
+    names = []
+    sequences = []
+    name = None
+    sequence = []
+    for line in f:
+        if line.startswith(comment):
+            continue
+        line = line.strip()
+        if line.startswith(b">"):
+            if name is not None:
+                names.append(name)
+                sequences.append(b"".join(sequence))
+            name = line[1:]
+            sequence = []
+        else:
+            sequence.append(line.upper())
+    if name is not None:
+        names.append(name)
+        sequences.append(b"".join(sequence))
+
+    return names, sequences
diff --git a/proteingym/baselines/PoET/poet/models/__init__.py b/proteingym/baselines/PoET/poet/models/__init__.py
diff --git a/proteingym/baselines/PoET/poet/models/modules/__init__.py b/proteingym/baselines/PoET/poet/models/modules/__init__.py
@@ -0,0 +1,6 @@
+from .activation import *
+from .attention import *
+from .attention_flash import *
+from .embedding import *
+from .transformer import *
+from .transformer_rotary import *
diff --git a/proteingym/baselines/PoET/poet/models/modules/activation.py b/proteingym/baselines/PoET/poet/models/modules/activation.py
@@ -0,0 +1,7 @@
+import torch
+import torch.nn.functional as F
+
+# Module-level alias so `gelu` can be imported from this module; it is simply
+# torch.nn.functional.gelu.
+gelu = F.gelu
+
+# Hand-written erf-based equivalent, kept for reference only
+# (1.41421356237 ~= sqrt(2)):
+# def gelu(x):
+#    return x*0.5*(1.0 + torch.erf(x/1.41421356237))