Cross Validation
Using a TrainedRotation, we can perform k-fold cross-validation on an ENA model.
First, we'll load a few libraries:
using EpistemicNetworkAnalysis
using DataFrames
using Statistics
using GLM
Second, we'll load our data and prepare our model config. We'll be using the FormulaRotation example from the ICQE23 workshop:
data = loadExample("transitions")
deriveAnyCode!(data, :BODY, :Changes, :Mood, :Oily, :Dysphoria, :Cry)
deriveAnyCode!(data, :REFLECT, :Identity, :Longing, :Dream, :Childhood, :Family, :Name, :Letter, :Doubt, :Religion)
deriveAnyCode!(data, :LEARN, :WWW, :Experiment, :Recipe)
deriveAnyCode!(data, :PROGRESS, :Strangers, :Passed, :Out, :Affirmation)
data[!, :All] .= "All"
codes = [:DoseTracking, :SkippedDose, :Happy, :NonHappy, :Sweets, :BODY, :REFLECT, :LEARN, :PROGRESS]
conversations = [:All]
units = [:Date]
rotation = FormulaRotation(
    LinearModel, @formula(y ~ 1 + Day), 2, nothing
)

Now we can start setting up our cross-validation. We'll give each row a random number from 1 to 5, setting us up for 5-fold cross-validation.
k_folds = 5
data[!, :Fold] .= rand(1:k_folds, nrow(data))
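As an aside, rand(1:k_folds, nrow(data)) only gives roughly equal fold sizes. If you'd rather have folds that are as even as possible, here is a minimal sketch of a balanced alternative, assuming DataFrames and the Random standard library; the assignfolds! helper is ours, not part of EpistemicNetworkAnalysis:

# Hypothetical balanced alternative to the random fold assignment above:
# shuffle the row indices, then deal folds out round-robin (1, 2, ..., k, 1, ...)
# so every fold ends up within one row of the same size
using Random

function assignfolds!(df, k)
    df[!, :Fold] .= 0
    for (position, rowindex) in enumerate(shuffle(1:nrow(df)))
        df[rowindex, :Fold] = mod1(position, k)
    end

    return df
end

assignfolds!(data, k_folds)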
Then, we'll iterate. We'll create a trainmodel with a unitFilter, using the logic row.Fold != i to select all units except our holdout set. After that, we'll create a testmodel with the opposite unitFilter and rotate it using TrainedRotation(trainmodel). That will project our holdout units into our trained embedding. The last thing we'll do in each iteration is grab a statistic to add to a results list:

results = Real[]
for i in 1:k_folds
    # Train on every unit except those in fold i
    trainmodel = ENAModel(
        data, codes, conversations, units,
        windowSize=4,
        recenterEmpty=true,
        rotateBy=rotation,
        unitFilter=(row)->(row.Fold != i)
    )

    # Project the held-out fold into the trained embedding
    testmodel = ENAModel(
        data, codes, conversations, units,
        windowSize=4,
        recenterEmpty=true,
        rotateBy=TrainedRotation(trainmodel),
        unitFilter=(row)->(row.Fold == i)
    )

    # Record the statistic of interest for this fold
    result = testmodel.embedding[1, :Formula_AdjR2]
    push!(results, result)
end

Finally, we'll display the results and their mean:
println(results)
println(mean(results))

Real[0.6410553378887827, 0.729945937068232, 0.6676411989894473, 0.7021199049311069, 0.7953667271978212]
0.7072258212150779

Putting it all together, here is a helper function you should be able to drop in and apply to your own data:
# Helper
function kfoldcv(wholemodel, k_folds, statistic)
    results = Real[]
    wholemodel.data[!, :Fold] .= rand(1:k_folds, nrow(wholemodel.data))
    for i in 1:k_folds
        # Refit the model on every unit except fold i, reusing wholemodel's config
        trainmodel = ENAModel(
            wholemodel,
            unitFilter=(row)->(row.Fold != i)
        )

        # Project the held-out fold into the trained embedding
        testmodel = ENAModel(
            wholemodel,
            rotateBy=TrainedRotation(trainmodel),
            unitFilter=(row)->(row.Fold == i)
        )

        result = testmodel.embedding[1, statistic]
        push!(results, result)
    end

    return results
end
# Example usage
wholemodel = ENAModel(
    data, codes, conversations, units,
    windowSize=4,
    recenterEmpty=true,
    rotateBy=rotation
)
results = kfoldcv(wholemodel, 5, :Formula_AdjR2)
println(results)
println(mean(results))

Real[0.6818339741324464, 0.838269302919314, 0.6991252844610778, 0.6905659649707705, 0.7167963576788439]
0.7253181768324904
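Because the fold assignment is random, the cross-validated statistic will vary a little from run to run. If you want a steadier estimate, one option (our suggestion, not something shown in the workshop materials) is to repeat the procedure several times and summarize the spread across all folds:

# Repeat the cross-validation to smooth out the randomness of the fold assignment;
# repeats = 10 is an arbitrary choice, and std comes from the Statistics stdlib loaded above
repeats = 10
allresults = reduce(vcat, [kfoldcv(wholemodel, 5, :Formula_AdjR2) for _ in 1:repeats])
println("mean: ", mean(allresults))
println("std: ", std(allresults))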