-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathremove_duplicate_compositions.py
79 lines (69 loc) · 2.14 KB
/
remove_duplicate_compositions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
"""Calculate reduced formulas and remove duplicate compositions.
Might take about an hour to run.
"""
import numpy as np
import pandas as pd
from tqdm import tqdm
from pymatgen.core import Composition
# Load the raw table; the rest of the script reads its "formula" and
# "calc_id" columns.
df = pd.read_csv(filepath_or_buffer="all-formula.csv")
# Sentinel labels for rows without a usable formula; rows carrying either
# label are dropped from the unique-formula table before it is written.
PARSE = "unparsable"
AVAIL = "unavailable"

# Substrings stripped from a raw formula before it is parsed. The
# replacements are applied one after another in insertion order, so every
# parenthesized label must be listed before its bare counterpart; otherwise
# "(alpha)" would be reduced to an unparsable "()".
blank = ""
mapper = {
    "-": blank,
    "(alpha)": blank,
    # The original literal listed "beta" twice (a dead duplicate key), which
    # left "(beta)" to be turned into "()"; handle the parenthesized form.
    "(beta)": blank,
    ",": blank,
    "(amorphous)": blank,
    "/": blank,
    "(anatase)": blank,
    "alpha": blank,
    "(hex)": blank,
    "beta": blank,
}

# Formulas containing any of these fragments are skipped outright.
# NOTE(review): "X" also matches xenon compounds ("Xe...") - confirm that
# excluding them is intended.
bad = ["X", "(6H)"]

# Applying `mapper` in one vectorized `Series.replace(mapper, regex=True)`
# call was tried but is slow, so the substitutions are done per formula:
# https://stackoverflow.com/questions/46342492/use-dictionary-to-replace-a-string-within-a-string-in-pandas-columns
# Reduce every raw formula and record the factor returned alongside it.
# `reduced_formulas` and `factors` stay index-aligned with the rows of `df`;
# `bad_formulas` collects the cleaned strings that could not be parsed.
reduced_formulas = []
factors = []
bad_formulas = []

for formula in tqdm(df["formula"]):
    # Skip non-string values (e.g. missing cells), the explicit
    # "unavailable" marker, and anything containing a blacklisted fragment.
    usable = (
        isinstance(formula, str)
        and formula != AVAIL
        and not any(fragment in formula for fragment in bad)
    )
    if not usable:
        reduced_formulas.append(AVAIL)
        factors.append(0)
        continue

    # Strip phase labels and stray punctuation, in the order `mapper` lists them.
    for old, new in mapper.items():
        formula = formula.replace(old, new)

    try:
        reduced_formula, factor = Composition(formula).get_reduced_formula_and_factor()
    except Exception:
        # Deliberately broad: this is a best-effort pass over messy data, and
        # any parsing failure should be recorded rather than abort the run.
        print(formula)
        bad_formulas.append(formula)
        # Label with PARSE so that the later `!= PARSE` filter really removes
        # these rows (the previous "bad_formula" label slipped through it).
        reduced_formulas.append(PARSE)
        factors.append(0)
    else:
        reduced_formulas.append(reduced_formula)
        factors.append(factor)

df["reduced_formula"] = reduced_formulas
df["factor"] = factors
# For each reduced formula, gather the calc_id-s of all rows sharing it and
# count how many there are.
by_formula = df.reset_index().groupby(by="reduced_formula")
uniq_df = by_formula.agg({"calc_id": lambda ids: tuple(ids)}).reset_index()
uniq_df["count"] = uniq_df["calc_id"].map(len)

# Drop the placeholder groups, then index the table by the calc_id tuples
# and write it out.
is_placeholder = uniq_df["reduced_formula"].isin([AVAIL, PARSE])
uniq_df = uniq_df[~is_placeholder]
uniq_df = uniq_df.set_index("calc_id")
uniq_df.to_csv("unique-reduced-formula.csv")
# Save the formulas that failed to parse so they can be inspected separately.
bad_df = pd.DataFrame({"bad_formula": bad_formulas})
bad_df.to_csv("bad-formula.csv")