// Predict whether a compound will have a bandgap by training two models: one
//  on compounds that consist entirely of metals, one where at least one non-metal is present

// Load in data sets, generate attributes
//  Entries are stored in a CompositionDataset bound to the variable "data"
data = new data.materials.CompositionDataset
//  Read the entries from a text file
data import ./datasets/small_set.txt
//  Set the directory holding the property lookup data, and select the
//   "general" property set to be used when generating attributes
data attributes properties directory ./lookup-data/
data attributes properties add set general
//  Compute the attributes for every entry currently in the dataset
data attributes generate

// Get only the ground-state instances of each compound:
//  when several entries are duplicates of each other, rank them by the
//  "energy_pa" property and keep the one that minimizes it.
//  The first resolver option selects the objective and is spelled out in full
//  ("minimize") so it does not depend on lenient keyword matching.
data duplicates RankingDuplicateResolver minimize PropertyRanker energy_pa SimpleEntryRanker

// Create a new property for each entry that is 1 when bandgap > 0, and 0 otherwise
//  First select "bandgap" as the target property, then apply the modifier
//  that converts it into the zero / non-zero class variable
data target bandgap
data modify NonZeroClassModifier

// Create training and validation sets
//  Work on a clone so that the original "data" variable is left untouched
train_data = data clone
//  NOTE(review): presumably "split 0.5" moves half of the entries out of
//   train_data and into run_data, leaving the two sets disjoint -- confirm
run_data = train_data split 0.5

// Create the composite model
model = new models.classification.SplitClassifier 

// Define how to partition the data
model splitter AllMetalsSplitter

// Create the algorithm to be used for each partition of the larger SplitClassifier
//  (here, a WekaClassifier configured with trees.REPTree)
leaf = new models.classification.WekaClassifier trees.REPTree

// Define the algorithm stored in "leaf" as the algorithm for all partitions
//  created by "model". In this case, the model creates two partitions:
//    "purely metallic" and "contains nonmetal"
model submodel set generic $leaf
// Report elapsed time once setup is complete
//  NOTE(review): no "timer start" precedes this call and no timer name is
//   given; presumably it reads a default timer started by Magpie -- confirm
timer elapsed
// The "generic" command above assigns the same algorithm to every partition.
// The algorithms for the two partitions could instead be set separately with:
//   > model submodel set 0 $leaf
//   > model submodel set 1 $leaf
// which, since both use "leaf", would be equivalent to the "generic" command


// Evaluate predictive ability and speed of single model
//  "leaf" is trained on train_data and validated on run_data; the "train"
//  and "run" timers bracket each step so their durations are reported separately
echo "Running single model"
timer start train
leaf train $train_data
timer elapsed train
timer start run
leaf validate $run_data
timer elapsed run
// Print the statistics from the validation against run_data
print leaf validation stats

// Evaluate predictive ability and speed of split model
//  Same procedure and same data sets as the single-model run, so the
//  results can be compared directly. The "train" and "run" timers are
//  started again here under the same names.
echo "Running split model"
timer start train
model train $train_data
timer elapsed train
timer start run
model validate $run_data
timer elapsed run
// Print the statistics from the validation against run_data
print model validation stats
	
// End the script
exit