-
Notifications
You must be signed in to change notification settings - Fork 0
/
tools.py
140 lines (111 loc) · 4.76 KB
/
tools.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
#from msilib import sequence
import os
from pathlib import Path
from typing import Dict, List, Tuple

from numpy import diff
from pyeed.core import ProteinInfo
def find_family(
    blast_results: List[ProteinInfo],
    word: str
) -> Tuple[List, List]:
    """
    Split proteins into two lists depending on whether their name contains a word.

    Entries without a usable name cannot be classified and are left out of
    both result lists. This includes plain dicts, which have no ``name``
    attribute.

    Args:
        blast_results (List[ProteinInfo]): Proteins obtained from a blast search.
        word (str): Text to look for in each protein name (plain ``in`` test).

    Returns:
        Tuple[List, List]: ``(found_family, different)``.
            found_family: Source IDs of proteins whose name contains ``word``.
            different: Source IDs of proteins whose name does not contain ``word``.
    """
    found_family = []
    different = []

    for protein in blast_results:
        # Read the name with a default: a dict entry has no `.name` attribute,
        # and a plain attribute access would raise AttributeError before the
        # dict check below could ever run.
        name = getattr(protein, "name", None)

        if not name:
            # Nameless dicts that carry a source_id are skipped quietly; every
            # other nameless entry is reported with the original message.
            if not (isinstance(protein, dict) and "source_id" in protein):
                print("Protein is not a dictionary.")
            continue

        if word in name:
            found_family.append(protein.source_id)
        else:
            different.append(protein.source_id)

    return found_family, different
def categorize_organism(
    blast_results: List[ProteinInfo],
    category: str
) -> Dict[str, List[str]]:
    """
    Group protein source IDs by one taxonomic attribute of their organism.

    Prints a header for the chosen category, then the share of proteins that
    falls into each group, as a percentage rounded to four decimals.

    Args:
        blast_results (List[ProteinInfo]): Blast results; each entry must expose
            ``source_id`` and an ``organism`` object carrying the chosen attribute.
        category (str): One of "kingdom", "domain", or "phylum".

    Returns:
        Dict[str, List[str]]: Maps each distinct value of the chosen attribute
        to the source IDs of the proteins that have it, in input order.

    Raises:
        NameError: If ``category`` is not one of the supported names.
    """
    # Header printed for each supported category; also serves as the validity check.
    headers = {
        "kingdom": "Kingdoms:",
        "domain": "Domains:",
        "phylum": "Phylum:",
    }
    if category not in headers:
        raise NameError("The given category is not valid. Try kingdom, domain, or phylum")
    print(headers[category])

    proteins = {}
    for protein in blast_results:
        # getattr() does not follow dotted paths such as "organism.kingdom",
        # so step through the `organism` attribute explicitly.
        category_value = getattr(protein.organism, category)
        proteins.setdefault(category_value, []).append(protein.source_id)

    # The loop below only runs when at least one protein was grouped, so
    # `total` is never zero at the division.
    total = len(blast_results)
    for key, source_ids in proteins.items():
        percentage = round(len(source_ids) / total * 100, 4)
        print(f"Percentage of {key}: {percentage}%")

    return proteins
def categorize_len(blast_results: List[ProteinInfo]) -> Tuple[List, List, List]:
    """
    Sort proteins into three size classes according to their sequence length.

    Args:
        blast_results (List[ProteinInfo]): Proteins from a blast search; each
            entry must expose ``sequence`` and ``source_id``.

    Returns:
        Tuple[List, List, List]: ``(small, middle, long)`` lists of source IDs.
            small: sequence length below 200.
            middle: sequence length from 200 up to and including 299.
            long: sequence length of 300 or more.
    """
    short_ids, medium_ids, long_ids = [], [], []

    for entry in blast_results:
        residue_count = len(entry.sequence)

        # Pick the target bucket first, then append once.
        if residue_count >= 300:
            bucket = long_ids
        elif residue_count >= 200:
            bucket = medium_ids
        else:
            bucket = short_ids
        bucket.append(entry.source_id)

    return short_ids, medium_ids, long_ids
def remove_duplicates(
    base_dir: str = "./blast_results",
    folder_count: int = 7,
):
    """
    Delete JSON files whose protein is already stored in another file.

    Recursively scans ``<base_dir>/blast_results_1`` through
    ``<base_dir>/blast_results_<folder_count>`` for ``*.json`` files, loads
    each one with ``ProteinInfo.from_json`` and groups the files by the
    protein's ``source_id``. For every ``source_id`` found in more than one
    file, the first file encountered is kept and the rest are removed from
    disk. Each group of duplicates and each deletion is printed.

    Args:
        base_dir (str): Directory holding the numbered ``blast_results_<n>``
            folders. Defaults to the original hard-coded location.
        folder_count (int): Number of numbered folders to scan, starting at 1.
            Defaults to the original hard-coded value of 7.
    """
    files_by_source_id = {}

    for index in range(1, folder_count + 1):
        folder = Path(base_dir) / f"blast_results_{index}"
        for path in folder.rglob("*.json"):
            with open(str(path.absolute())) as f:
                protein = ProteinInfo.from_json(f)
            files_by_source_id.setdefault(protein.source_id, []).append(path)

    for paths in files_by_source_id.values():
        if len(paths) < 2:
            continue

        # Report the files of this duplicate group. The scanning loop's own
        # `path` variable would only name the last file scanned.
        listing = "\n".join(str(p) for p in paths)
        print(f"Duplicate files found:\n{listing}\n")

        # Keep the first file of the group and delete the others.
        for duplicate in paths[1:]:
            os.remove(duplicate)
            print(f"{duplicate} has been deleted.\n")