Showing 1 changed file with 75 additions and 104 deletions.
@@ -1,10 +1,4 @@
"""
@Time : 2024/09/30 11:17:24
@Author : Daniel Persaud
@Version : 1.0
@Contact : [email protected]
@Desc : Hardness benchmarking, a maximization task on experimental hardness dataset.
"""
# Hardness benchmarking, a maximization task on experimental hardness dataset.

from __future__ import annotations

@@ -32,61 +26,55 @@
    ConvergenceExperimentSettings,
)

# Set up directory and load datasets
HomeDir = os.getcwd()
# Materials Project (MP) bulk modulus dataset
dfMP = pd.read_csv(os.path.join(HomeDir, "benchmarks", "domains", "mp_bulkModulus_goodOverlap.csv"), index_col=0)
# Experimental (Exp) hardness dataset
dfExp = pd.read_csv(os.path.join(HomeDir, "benchmarks", "domains", "exp_hardness_goodOverlap.csv"), index_col=0)
elementCols = dfExp.columns.to_list()[4:]

# IMPORT AND PREPROCESS DATA------------------------------------------------------------------------------
strHomeDir = os.getcwd()
dfMP = pd.read_csv(
    os.path.join(strHomeDir, "benchmarks", "domains", "mp_bulkModulus_goodOverlap.csv"), index_col=0
)
dfExp = pd.read_csv(
    os.path.join(strHomeDir, "benchmarks", "domains", "exp_hardness_goodOverlap.csv"), index_col=0
)
lstElementCols = dfExp.columns.to_list()[4:]

# ----- FUTHER CLEAN THE DATA BASED ON THE EDA -----
# initialize an empty dataframe to store the integrated hardness values
# Initialize an empty dataframe to store the integrated hardness values
dfExp_integratedHardness = pd.DataFrame()

# for each unique composition in dfExp, make a cubic spline interpolation of the hardness vs load curve
for strComposition_temp in dfExp["composition"].unique():
    dfComposition_temp = dfExp[dfExp["composition"] == strComposition_temp]
    # sort the data by load
    dfComposition_temp = dfComposition_temp.sort_values(by="load")
    dfComposition_temp = dfComposition_temp.drop_duplicates(subset="load")
    if len(dfComposition_temp) < 5: # continue to the next composition
# For each unique composition in dfExp, make a cubic spline interpolation of the hardness vs load curve
for composition_i in dfExp["composition"].unique():
    composition_subset = dfExp[dfExp["composition"] == composition_i]
    # Sort the data by load
    composition_subset = composition_subset.sort_values(by="load")
    composition_subset = composition_subset.drop_duplicates(subset="load")
    if len(composition_subset) < 5: # Continue to the next composition
        continue

    # make a cubic spline interpolation of the hardness vs load curve
    spSpline_temp = sp.interpolate.CubicSpline(dfComposition_temp["load"], dfComposition_temp["hardness"])
    # integrate the spline from the minimum load to the maximum load
    fltIntegral_temp = spSpline_temp.integrate(0.5, 5, extrapolate = True)
    # Perform cubic spline interpolation of the hardness vs load curve
    spline = sp.interpolate.CubicSpline(composition_subset["load"], composition_subset["hardness"])
    # Integrate the spline from the minimum load to the maximum load
    integrated_value = spline.integrate(0.5, 5, extrapolate = True)

    # make a new dataframe with the lstElementCols from dfComposition_temp
    dfComposition_temp = dfComposition_temp[['strComposition', 'composition'] + lstElementCols]
    dfComposition_temp = dfComposition_temp.drop_duplicates(subset='composition')
    dfComposition_temp["integratedHardness"] = fltIntegral_temp
    # Make a new dataframe with the elementCols from composition_subset
    composition_summary = composition_subset[['strComposition', 'composition'] + elementCols]
    composition_summary = composition_summary.drop_duplicates(subset='composition')
    composition_summary["integratedHardness"] = integrated_value

    dfExp_integratedHardness = pd.concat([dfExp_integratedHardness, dfComposition_temp])
    dfExp_integratedHardness = pd.concat([dfExp_integratedHardness, composition_summary])

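Note: the loop above reduces each composition's hardness-vs-load curve to a single scalar by integrating a cubic spline fit over the load range. A minimal, self-contained sketch of that step (the load and hardness values here are synthetic, chosen only for illustration):

import numpy as np
from scipy.interpolate import CubicSpline

# Hypothetical hardness-vs-load measurements for one composition (values made up).
load = np.array([0.5, 1.0, 2.0, 3.0, 5.0])
hardness = np.array([32.0, 28.5, 26.4, 25.2, 24.3])

# Fit a cubic spline through the measurements and integrate it over the load
# range used above (0.5 to 5), extrapolating if the data stop short of it.
spline = CubicSpline(load, hardness)
integrated_hardness = spline.integrate(0.5, 5, extrapolate=True)
print(f"integrated hardness: {integrated_hardness:.2f}")
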
# ----- TARGET FUNCTION (INTEGRATED HARDNESS) -----
# make a dataframe for the task function (integrated hardness)
dfSearchSpace_target = dfExp_integratedHardness[lstElementCols]
# ----- Target function (integrated hardness) -----
dfSearchSpace_target = dfExp_integratedHardness[elementCols]
dfSearchSpace_target["Function"] = "targetFunction"

# make a lookup table for the task function (integrate hardness) - add the 'integratedHardness' column from dfExp to dfSearchSpace_task
# Make a lookup table for the target function (integrated hardness) - add the 'integratedHardness' column from dfExp_integratedHardness to dfSearchSpace_target
dfLookupTable_target = pd.concat([dfSearchSpace_target, dfExp_integratedHardness["integratedHardness"]], axis=1)
dfLookupTable_target = dfLookupTable_target.rename(columns={"integratedHardness":"Target"})

# ----- SOURCE FUNCTION (VOIGT BULK MODULUS) -----
# make a dataframe for the source function (voigt bulk modulus)
dfSearchSpace_source = dfMP[lstElementCols]
# ----- Source function (Voigt bulk modulus) -----
dfSearchSpace_source = dfMP[elementCols]
dfSearchSpace_source["Function"] = "sourceFunction"

# make a lookup table for the source function (voigt bulk modulus) - add the 'vrh' column from dfMP to dfSearchSpace_source
# Make a lookup table for the source function (Voigt bulk modulus) - add the 'vrh' column from dfMP to dfSearchSpace_source
dfLookupTable_source = pd.concat([dfSearchSpace_source, dfMP["vrh"]], axis=1)
dfLookupTable_source = dfLookupTable_source.rename(columns={"vrh": "Target"})

# concatenate the two dataframes
# Combine the search space
dfSearchSpace = pd.concat([dfSearchSpace_target, dfSearchSpace_source])

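Note: both lookup tables built above end up with the same layout the benchmark relies on: the element-fraction parameter columns, the Function task label, and a Target column. A rough illustration with only two descriptor columns and hypothetical values (the real tables carry every column in elementCols, and each table holds a single Function value):

import pandas as pd

# Hypothetical two-row illustration of the lookup-table layout.
example_lookup = pd.DataFrame(
    {
        "B": [0.67, 0.00],                                 # element fractions
        "W": [0.00, 0.50],
        "Function": ["targetFunction", "sourceFunction"],  # task label
        "Target": [123.4, 98.7],                           # integrated hardness / 'vrh'
    }
)
print(example_lookup)
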
def hardness(settings: ConvergenceExperimentSettings) -> DataFrame:

@@ -107,52 +95,46 @@ def hardness(settings: ConvergenceExperimentSettings) -> DataFrame:
    Objective: Maximization
    """

    lstParameters_bb = []
    lstParameters_bb_noTask = []
    parameters = []
    parameters_noTask = []

    # for each column in dfSearchSpace except the last one, create a NumericalDiscreteParameter
    for strCol_temp in dfSearchSpace.columns[:-1]:
        bbParameter_temp = NumericalDiscreteParameter(
            name=strCol_temp,
            values=np.unique(dfSearchSpace[strCol_temp]),
    # For each column in dfSearchSpace except the last one, create a NumericalDiscreteParameter
    for column in dfSearchSpace.columns[:-1]:
        parameter_i = NumericalDiscreteParameter(
            name=column,
            values=np.unique(dfSearchSpace[column]),
            tolerance=0.0,
        )
        # append the parameter to the list of parameters
        lstParameters_bb.append(bbParameter_temp)
        lstParameters_bb_noTask.append(bbParameter_temp)
        parameters.append(parameter_i)
        parameters_noTask.append(parameter_i)

    # create a TaskParameter
    bbTaskParameter = TaskParameter(
    # Create TaskParameter
    taskParameter = TaskParameter(
        name="Function",
        values=["targetFunction", "sourceFunction"],
        active_values=["targetFunction"],
    )
    parameters.append(taskParameter)

    # append the taskParameter to the list of parameters
    lstParameters_bb.append(bbTaskParameter)

    search_space = SearchSpace.from_dataframe(dfSearchSpace, parameters=lstParameters_bb)
    SearchSpace_noTask = SearchSpace.from_dataframe(dfSearchSpace_target[lstElementCols], parameters=lstParameters_bb_noTask)
    search_space = SearchSpace.from_dataframe(dfSearchSpace, parameters=parameters)
    SearchSpace_noTask = SearchSpace.from_dataframe(dfSearchSpace_target[elementCols], parameters=parameters_noTask)

    objective = NumericalTarget(name="Target", mode=TargetMode.MAX).to_objective()

    scenarios: dict[str, Campaign] = {
        "Random Recommender": Campaign(
            searchspace=SearchSpace.from_dataframe(
                dfSearchSpace_target[lstElementCols],
                parameters=lstParameters_bb_noTask
                dfSearchSpace_target[elementCols],
                parameters=parameters_noTask
            ),
            recommender=RandomRecommender(),
            objective=objective,
        ),
        "Default Recommender": Campaign(
            searchspace=SearchSpace.from_dataframe(
                dfSearchSpace,
                parameters=lstParameters_bb,
            ),
            searchspace=search_space,
            objective=objective,
        ),
        "noTask_bb": Campaign(
        "No TaskParameter": Campaign(
            searchspace=SearchSpace_noTask,
            objective=objective,
        ),

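Note: the transfer-learning ingredient in both benchmark functions is the TaskParameter. A minimal sketch of the pattern on a toy dataframe (import paths assumed from BayBE's public API; the names and values here are illustrative, not the benchmark's):

import pandas as pd
from baybe.parameters import NumericalDiscreteParameter, TaskParameter
from baybe.searchspace import SearchSpace

# Toy search space: one descriptor column plus a task label per row.
df_toy = pd.DataFrame(
    {
        "x": [0.0, 0.5, 1.0, 0.0, 1.0],
        "Function": ["targetFunction"] * 3 + ["sourceFunction"] * 2,
    }
)

toy_parameters = [
    NumericalDiscreteParameter(name="x", values=(0.0, 0.5, 1.0), tolerance=0.0),
    # active_values restricts recommendations to the target task, while
    # measurements labelled with the source task (e.g. passed as initial data)
    # still inform the shared surrogate model.
    TaskParameter(
        name="Function",
        values=["targetFunction", "sourceFunction"],
        active_values=["targetFunction"],
    ),
]
toy_searchspace = SearchSpace.from_dataframe(df_toy, parameters=toy_parameters)
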
@@ -186,44 +168,41 @@ def hardness_transfer_learning(settings: ConvergenceExperimentSettings) -> DataF
    Objective: Maximization
    """

    lstParameters_bb = []
    lstParameters_bb_noTask = []
    parameters = []
    parameters_noTask = []

    # for each column in dfSearchSpace except the last one, create a NumericalDiscreteParameter
    for strCol_temp in dfSearchSpace.columns[:-1]:
        bbParameter_temp = NumericalDiscreteParameter(
            name=strCol_temp,
            values=np.unique(dfSearchSpace[strCol_temp]),
    # For each column in dfSearchSpace except the last one, create a NumericalDiscreteParameter
    for column in dfSearchSpace.columns[:-1]:
        parameter_i = NumericalDiscreteParameter(
            name=column,
            values=np.unique(dfSearchSpace[column]),
            tolerance=0.0,
        )
        # append the parameter to the list of parameters
        lstParameters_bb.append(bbParameter_temp)
        lstParameters_bb_noTask.append(bbParameter_temp)
        parameters.append(parameter_i)
        parameters_noTask.append(parameter_i)

    # create a TaskParameter
    bbTaskParameter = TaskParameter(
    # Create TaskParameter
    taskParameter = TaskParameter(
        name="Function",
        values=["targetFunction", "sourceFunction"],
        active_values=["targetFunction"],
    )

    # append the taskParameter to the list of parameters
    lstParameters_bb.append(bbTaskParameter)
    parameters.append(taskParameter)

    objective = NumericalTarget(name="Target", mode=TargetMode.MAX).to_objective()

    search_space = SearchSpace.from_dataframe(dfSearchSpace, parameters=parameters)
    campaign = Campaign(searchspace=search_space, objective=objective)

    # Use diff init data size ----------------------
    # Create a list of dataframes with n samples from dfLookupTable_source to use as initial data
    for n in (2, 4, 6, 30):
        bbSearchSpace = SearchSpace.from_dataframe(dfSearchSpace, parameters=lstParameters_bb)
        bbCampaign_temp = Campaign(
            searchspace=bbSearchSpace,
            objective=objective)
        # create a list of dataframes with n samples from dfLookupTable_source to use as initial data
        lstInitialData_temp = [dfLookupTable_source.sample(n) for _ in range(settings.n_mc_iterations)]
        initialData_i = [dfLookupTable_source.sample(n) for _ in range(settings.n_mc_iterations)]

        return simulate_scenarios(
            {f"{n} Initial Data": bbCampaign_temp},
            {f"{n} Initial Data": campaign},
            dfLookupTable_target,
            initial_data=lstInitialData_temp,
            initial_data=initialData_i,
            batch_size=settings.batch_size,
            n_doe_iterations=settings.n_doe_iterations,
            impute_mode="error",

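Note: a minimal sketch of how the per-size simulations could be gathered into a single frame, assuming the intent is to compare all four initial-data sizes in one returned DataFrame. It reuses the script's names (campaign, dfLookupTable_source, dfLookupTable_target, simulate_scenarios, settings); only the accumulation pattern itself is the illustration:

import pandas as pd

def hardness_transfer_learning_all_sizes(settings: ConvergenceExperimentSettings) -> DataFrame:
    """Accumulate one simulate_scenarios result per initial-data size (sketch)."""
    results = []
    for n in (2, 4, 6, 30):
        initial_data_n = [dfLookupTable_source.sample(n) for _ in range(settings.n_mc_iterations)]
        results.append(
            simulate_scenarios(
                {f"{n} Initial Data": campaign},
                dfLookupTable_target,
                initial_data=initial_data_n,
                batch_size=settings.batch_size,
                n_doe_iterations=settings.n_doe_iterations,
                impute_mode="error",
            )
        )
    return pd.concat(results, ignore_index=True)
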
@@ -252,18 +231,14 @@ def hardness_transfer_learning(settings: ConvergenceExperimentSettings) -> DataF

if __name__ == "__main__":

    # describe the benchmark task
    # Describe the benchmark task
    print("Hardness benchmark is a maximization task on an experimental hardness dataset. ")
    print("The dataset is downselected to 94 compositions with more than 5 hardness values. ")
    print("The hardness values are integrated using cubic spline interpolation, and the task is to maximize the integrated hardness. ")
    print("")
    print("Hardness benchmark compares across random, default, and no task parameter set up. ")
    print("")
    print("The hardness values are integrated using cubic spline interpolation, and the task is to maximize the integrated hardness. \n")
    print("Hardness benchmark compares across the random, default, and no-task-parameter setups. \n")
    print("Hardness transfer learning benchmark compares across different initial data sizes. ")

    # Visualize the Hardness value histogram
    # initialize a subplot with 1 row and 1 column
    # Visualize the Hardness value histogram
    fig, ax = plt.subplots(
        1, 1,
        figsize=(8, 5),

@@ -272,13 +247,9 @@ def hardness_transfer_learning(settings: ConvergenceExperimentSettings) -> DataF
        constrained_layout = True
    )

    # plot a histogram of the hardness values
    # Plot a histogram of the hardness values
    ax.hist(dfExp["hardness"], bins=20)

    # add a title, x-aixs label, and y-axis label
    ax.set_xlabel("Hardness")
    ax.set_ylabel("Frequency")
    ax.set_title("Integrated Hardness Distribution")

    # add a grid
    ax.grid()
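
Note: a rough sketch of how either benchmark function could be exercised directly, assuming ConvergenceExperimentSettings accepts the three fields the functions read (batch_size, n_doe_iterations, n_mc_iterations); the actual constructor may differ:

# Hypothetical invocation; field names inferred from their use in the functions above.
settings = ConvergenceExperimentSettings(
    batch_size=1,
    n_doe_iterations=20,
    n_mc_iterations=5,
)
df_hardness = hardness(settings)
df_transfer = hardness_transfer_learning(settings)
print(df_hardness.head())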