Showing 1 changed file with 75 additions and 104 deletions.
@@ -1,10 +1,4 @@
"""
@Time : 2024/09/30 11:17:24
@Author : Daniel Persaud
@Version : 1.0
@Contact : [email protected]
@Desc : Hardness benchmarking, a maximization task on experimental hardness dataset.
"""
# Hardness benchmarking, a maximization task on experimental hardness dataset.

from __future__ import annotations

@@ -32,61 +26,55 @@
    ConvergenceExperimentSettings,
)

# Set up directory and load datasets
HomeDir = os.getcwd()
# Materials Project (MP) bulk modulus dataset
dfMP = pd.read_csv(os.path.join(HomeDir, "benchmarks", "domains", "mp_bulkModulus_goodOverlap.csv"), index_col=0)
# Experimental (Exp) hardness dataset
dfExp = pd.read_csv(os.path.join(HomeDir, "benchmarks", "domains", "exp_hardness_goodOverlap.csv"), index_col=0)
elementCols = dfExp.columns.to_list()[4:]

# IMPORT AND PREPROCESS DATA------------------------------------------------------------------------------
strHomeDir = os.getcwd()
dfMP = pd.read_csv(
    os.path.join(strHomeDir, "benchmarks", "domains", "mp_bulkModulus_goodOverlap.csv"), index_col=0
)
dfExp = pd.read_csv(
    os.path.join(strHomeDir, "benchmarks", "domains", "exp_hardness_goodOverlap.csv"), index_col=0
)
lstElementCols = dfExp.columns.to_list()[4:]

# ----- FUTHER CLEAN THE DATA BASED ON THE EDA -----
# initialize an empty dataframe to store the integrated hardness values
# Initialize an empty dataframe to store the integrated hardness values
dfExp_integratedHardness = pd.DataFrame()

# for each unique composition in dfExp, make a cubic spline interpolation of the hardness vs load curve
for strComposition_temp in dfExp["composition"].unique():
    dfComposition_temp = dfExp[dfExp["composition"] == strComposition_temp]
    # sort the data by load
    dfComposition_temp = dfComposition_temp.sort_values(by="load")
    dfComposition_temp = dfComposition_temp.drop_duplicates(subset="load")
    if len(dfComposition_temp) < 5: # continue to the next composition
# For each unique composition in dfExp, make a cubic spline interpolation of the hardness vs load curve
for composition_i in dfExp["composition"].unique():
    composition_subset = dfExp[dfExp["composition"] == composition_i]
    # Sort the data by load
    composition_subset = composition_subset.sort_values(by="load")
    composition_subset = composition_subset.drop_duplicates(subset="load")
    if len(composition_subset) < 5: # Continue to the next composition
        continue

    # make a cubic spline interpolation of the hardness vs load curve
    spSpline_temp = sp.interpolate.CubicSpline(dfComposition_temp["load"], dfComposition_temp["hardness"])
    # integrate the spline from the minimum load to the maximum load
    fltIntegral_temp = spSpline_temp.integrate(0.5, 5, extrapolate = True)
    # Perform cubic spline interpolation of the hardness vs load curve
    spline = sp.interpolate.CubicSpline(composition_subset["load"], composition_subset["hardness"])
    # Integrate the spline from the minimum load to the maximum load
    integrated_value = spline.integrate(0.5, 5, extrapolate = True)

    # make a new dataframe with the lstElementCols from dfComposition_temp
    dfComposition_temp = dfComposition_temp[['strComposition', 'composition'] + lstElementCols]
    dfComposition_temp = dfComposition_temp.drop_duplicates(subset='composition')
    dfComposition_temp["integratedHardness"] = fltIntegral_temp
    # Make a new dataframe with the elementCols from composition_subset
    composition_summary = composition_subset[['strComposition', 'composition'] + elementCols]
    composition_summary = composition_summary.drop_duplicates(subset='composition')
    composition_summary["integratedHardness"] = integrated_value

    dfExp_integratedHardness = pd.concat([dfExp_integratedHardness, dfComposition_temp])
    dfExp_integratedHardness = pd.concat([dfExp_integratedHardness, composition_summary])

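Note: the loop above reduces each composition's hardness-vs-load curve to a single scalar by integrating a cubic spline fit over the load range. A minimal, self-contained sketch of that step (the load and hardness values here are synthetic, chosen only for illustration):

import numpy as np
from scipy.interpolate import CubicSpline

# Hypothetical hardness-vs-load measurements for one composition (values made up).
load = np.array([0.5, 1.0, 2.0, 3.0, 5.0])
hardness = np.array([32.0, 28.5, 26.4, 25.2, 24.3])

# Fit a cubic spline through the measurements and integrate it over the load
# range used above (0.5 to 5), extrapolating if the data stop short of it.
spline = CubicSpline(load, hardness)
integrated_hardness = spline.integrate(0.5, 5, extrapolate=True)
print(f"integrated hardness: {integrated_hardness:.2f}")
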
# ----- TARGET FUNCTION (INTEGRATED HARDNESS) -----
# make a dataframe for the task function (integrated hardness)
dfSearchSpace_target = dfExp_integratedHardness[lstElementCols]
# ----- Target function (integrated hardness) -----
dfSearchSpace_target = dfExp_integratedHardness[elementCols]
dfSearchSpace_target["Function"] = "targetFunction"

# make a lookup table for the task function (integrate hardness) - add the 'integratedHardness' column from dfExp to dfSearchSpace_task
# Make a lookup table for the target function (integrated hardness) - add the 'integratedHardness' column from dfExp_integratedHardness to dfSearchSpace_target
dfLookupTable_target = pd.concat([dfSearchSpace_target, dfExp_integratedHardness["integratedHardness"]], axis=1)
dfLookupTable_target = dfLookupTable_target.rename(columns={"integratedHardness":"Target"})

# ----- SOURCE FUNCTION (VOIGT BULK MODULUS) -----
# make a dataframe for the source function (voigt bulk modulus)
dfSearchSpace_source = dfMP[lstElementCols]
# ----- Source function (Voigt bulk modulus) -----
dfSearchSpace_source = dfMP[elementCols]
dfSearchSpace_source["Function"] = "sourceFunction"

# make a lookup table for the source function (voigt bulk modulus) - add the 'vrh' column from dfMP to dfSearchSpace_source
# Make a lookup table for the source function (Voigt bulk modulus) - add the 'vrh' column from dfMP to dfSearchSpace_source
dfLookupTable_source = pd.concat([dfSearchSpace_source, dfMP["vrh"]], axis=1)
dfLookupTable_source = dfLookupTable_source.rename(columns={"vrh": "Target"})

# concatenate the two dataframes
# Combine the search space
dfSearchSpace = pd.concat([dfSearchSpace_target, dfSearchSpace_source])

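Note: both lookup tables built above end up with the same layout the benchmark relies on: the element-fraction parameter columns, the Function task label, and a Target column. A rough illustration with only two descriptor columns and hypothetical values (the real tables carry every column in elementCols, and each table holds a single Function value):

import pandas as pd

# Hypothetical two-row illustration of the lookup-table layout.
example_lookup = pd.DataFrame(
    {
        "B": [0.67, 0.00],                                 # element fractions
        "W": [0.00, 0.50],
        "Function": ["targetFunction", "sourceFunction"],  # task label
        "Target": [123.4, 98.7],                           # integrated hardness / 'vrh'
    }
)
print(example_lookup)
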
def hardness(settings: ConvergenceExperimentSettings) -> DataFrame:

@@ -107,52 +95,46 @@ def hardness(settings: ConvergenceExperimentSettings) -> DataFrame:
    Objective: Maximization
    """

    lstParameters_bb = []
    lstParameters_bb_noTask = []
    parameters = []
    parameters_noTask = []

    # for each column in dfSearchSpace except the last one, create a NumericalDiscreteParameter
    for strCol_temp in dfSearchSpace.columns[:-1]:
        bbParameter_temp = NumericalDiscreteParameter(
            name=strCol_temp,
            values=np.unique(dfSearchSpace[strCol_temp]),
    # For each column in dfSearchSpace except the last one, create a NumericalDiscreteParameter
    for column in dfSearchSpace.columns[:-1]:
        parameter_i = NumericalDiscreteParameter(
            name=column,
            values=np.unique(dfSearchSpace[column]),
            tolerance=0.0,
        )
        # append the parameter to the list of parameters
        lstParameters_bb.append(bbParameter_temp)
        lstParameters_bb_noTask.append(bbParameter_temp)
        parameters.append(parameter_i)
        parameters_noTask.append(parameter_i)

    # create a TaskParameter
    bbTaskParameter = TaskParameter(
    # Create TaskParameter
    taskParameter = TaskParameter(
        name="Function",
        values=["targetFunction", "sourceFunction"],
        active_values=["targetFunction"],
    )
    parameters.append(taskParameter)

    # append the taskParameter to the list of parameters
    lstParameters_bb.append(bbTaskParameter)

    search_space = SearchSpace.from_dataframe(dfSearchSpace, parameters=lstParameters_bb)
    SearchSpace_noTask = SearchSpace.from_dataframe(dfSearchSpace_target[lstElementCols], parameters=lstParameters_bb_noTask)
    search_space = SearchSpace.from_dataframe(dfSearchSpace, parameters=parameters)
    SearchSpace_noTask = SearchSpace.from_dataframe(dfSearchSpace_target[elementCols], parameters=parameters_noTask)

    objective = NumericalTarget(name="Target", mode=TargetMode.MAX).to_objective()

    scenarios: dict[str, Campaign] = {
        "Random Recommender": Campaign(
            searchspace=SearchSpace.from_dataframe(
                dfSearchSpace_target[lstElementCols],
                parameters=lstParameters_bb_noTask
                dfSearchSpace_target[elementCols],
                parameters=parameters_noTask
            ),
            recommender=RandomRecommender(),
            objective=objective,
        ),
        "Default Recommender": Campaign(
            searchspace=SearchSpace.from_dataframe(
                dfSearchSpace,
                parameters=lstParameters_bb,
            ),
            searchspace=search_space,
            objective=objective,
        ),
        "noTask_bb": Campaign(
        "No TaskParameter": Campaign(
            searchspace=SearchSpace_noTask,
            objective=objective,
        ),

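Note: the transfer-learning ingredient in both benchmark functions is the TaskParameter. A minimal sketch of the pattern on a toy dataframe (import paths assumed from BayBE's public API; the names and values here are illustrative, not the benchmark's):

import pandas as pd
from baybe.parameters import NumericalDiscreteParameter, TaskParameter
from baybe.searchspace import SearchSpace

# Toy search space: one descriptor column plus a task label per row.
df_toy = pd.DataFrame(
    {
        "x": [0.0, 0.5, 1.0, 0.0, 1.0],
        "Function": ["targetFunction"] * 3 + ["sourceFunction"] * 2,
    }
)

toy_parameters = [
    NumericalDiscreteParameter(name="x", values=(0.0, 0.5, 1.0), tolerance=0.0),
    # active_values restricts recommendations to the target task, while
    # measurements labelled with the source task (e.g. passed as initial data)
    # still inform the shared surrogate model.
    TaskParameter(
        name="Function",
        values=["targetFunction", "sourceFunction"],
        active_values=["targetFunction"],
    ),
]
toy_searchspace = SearchSpace.from_dataframe(df_toy, parameters=toy_parameters)
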
@@ -186,44 +168,41 @@ def hardness_transfer_learning(settings: ConvergenceExperimentSettings) -> DataF
    Objective: Maximization
    """

    lstParameters_bb = []
    lstParameters_bb_noTask = []
    parameters = []
    parameters_noTask = []

    # for each column in dfSearchSpace except the last one, create a NumericalDiscreteParameter
    for strCol_temp in dfSearchSpace.columns[:-1]:
        bbParameter_temp = NumericalDiscreteParameter(
            name=strCol_temp,
            values=np.unique(dfSearchSpace[strCol_temp]),
    # For each column in dfSearchSpace except the last one, create a NumericalDiscreteParameter
    for column in dfSearchSpace.columns[:-1]:
        parameter_i = NumericalDiscreteParameter(
            name=column,
            values=np.unique(dfSearchSpace[column]),
            tolerance=0.0,
        )
        # append the parameter to the list of parameters
        lstParameters_bb.append(bbParameter_temp)
        lstParameters_bb_noTask.append(bbParameter_temp)
        parameters.append(parameter_i)
        parameters_noTask.append(parameter_i)

    # create a TaskParameter
    bbTaskParameter = TaskParameter(
    # Create TaskParameter
    taskParameter = TaskParameter(
        name="Function",
        values=["targetFunction", "sourceFunction"],
        active_values=["targetFunction"],
    )

    # append the taskParameter to the list of parameters
    lstParameters_bb.append(bbTaskParameter)
    parameters.append(taskParameter)

    objective = NumericalTarget(name="Target", mode=TargetMode.MAX).to_objective()

    search_space = SearchSpace.from_dataframe(dfSearchSpace, parameters=parameters)
    campaign = Campaign(searchspace=search_space, objective=objective)

    # Use diff init data size ----------------------
    # Create a list of dataframes with n samples from dfLookupTable_source to use as initial data
    for n in (2, 4, 6, 30):
        bbSearchSpace = SearchSpace.from_dataframe(dfSearchSpace, parameters=lstParameters_bb)
        bbCampaign_temp = Campaign(
            searchspace=bbSearchSpace,
            objective=objective)
        # create a list of dataframes with n samples from dfLookupTable_source to use as initial data
        lstInitialData_temp = [dfLookupTable_source.sample(n) for _ in range(settings.n_mc_iterations)]
        initialData_i = [dfLookupTable_source.sample(n) for _ in range(settings.n_mc_iterations)]

        return simulate_scenarios(
            {f"{n} Initial Data": bbCampaign_temp},
            {f"{n} Initial Data": campaign},
            dfLookupTable_target,
            initial_data=lstInitialData_temp,
            initial_data=initialData_i,
            batch_size=settings.batch_size,
            n_doe_iterations=settings.n_doe_iterations,
            impute_mode="error",

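Note: a minimal sketch of how the per-size simulations could be gathered into a single frame, assuming the intent is to compare all four initial-data sizes in one returned DataFrame. It reuses the script's names (campaign, dfLookupTable_source, dfLookupTable_target, simulate_scenarios, settings); only the accumulation pattern itself is the illustration:

import pandas as pd

def hardness_transfer_learning_all_sizes(settings: ConvergenceExperimentSettings) -> DataFrame:
    """Accumulate one simulate_scenarios result per initial-data size (sketch)."""
    results = []
    for n in (2, 4, 6, 30):
        initial_data_n = [dfLookupTable_source.sample(n) for _ in range(settings.n_mc_iterations)]
        results.append(
            simulate_scenarios(
                {f"{n} Initial Data": campaign},
                dfLookupTable_target,
                initial_data=initial_data_n,
                batch_size=settings.batch_size,
                n_doe_iterations=settings.n_doe_iterations,
                impute_mode="error",
            )
        )
    return pd.concat(results, ignore_index=True)
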
@@ -252,18 +231,14 @@ def hardness_transfer_learning(settings: ConvergenceExperimentSettings) -> DataF

if __name__ == "__main__":

    # describe the benchmark task
    # Describe the benchmark task
    print("Hardness benchmark is a maximization task on an experimental hardness dataset. ")
    print("The dataset is downselected to 94 compositions with more than 5 hardness values. ")
    print("The hardness values are integrated using cubic spline interpolation, and the task is to maximize the integrated hardness. ")
    print("")
    print("Hardness benchmark compares across random, default, and no task parameter set up. ")
    print("")
    print("The hardness values are integrated using cubic spline interpolation, and the task is to maximize the integrated hardness. \n")
    print("Hardness benchmark compares across the random, default, and no-task-parameter setups. \n")
    print("Hardness transfer learning benchmark compares across different initial data sizes. ")

    # Visualize the Hardness value histogram
    # initialize a subplot with 1 row and 1 column
    # Visualize the Hardness value histogram
    fig, ax = plt.subplots(
        1, 1,
        figsize=(8, 5),

@@ -272,13 +247,9 @@ def hardness_transfer_learning(settings: ConvergenceExperimentSettings) -> DataF
        constrained_layout = True
    )

    # plot a histogram of the hardness values
    # Plot a histogram of the hardness values
    ax.hist(dfExp["hardness"], bins=20)

    # add a title, x-aixs label, and y-axis label
    ax.set_xlabel("Hardness")
    ax.set_ylabel("Frequency")
    ax.set_title("Integrated Hardness Distribution")

    # add a grid
    ax.grid()
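
Note: a rough sketch of how either benchmark function could be exercised directly, assuming ConvergenceExperimentSettings accepts the three fields the functions read (batch_size, n_doe_iterations, n_mc_iterations); the actual constructor may differ:

# Hypothetical invocation; field names inferred from their use in the functions above.
settings = ConvergenceExperimentSettings(
    batch_size=1,
    n_doe_iterations=20,
    n_mc_iterations=5,
)
df_hardness = hardness(settings)
df_transfer = hardness_transfer_learning(settings)
print(df_hardness.head())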