Compute costs matrix from phylogenetic tree and reorganise sources
This commit is contained in:
parent
c117b9cccb
commit
34d9cd000d
5 changed files with 136 additions and 215 deletions
|
@ -1,15 +1,10 @@
|
|||
module GardenOptim
|
||||
|
||||
using Logging
|
||||
using Unicode
|
||||
|
||||
using DataFrames
|
||||
using DocStringExtensions
|
||||
using CSV
|
||||
using JSON
|
||||
using Tables
|
||||
|
||||
export loadclassification, loadplants, loadgarden, loadcosts
|
||||
export loadplants, loadgarden, loadclassification, loadcosts
|
||||
export update!, randomgardenevolution!, outputgarden
|
||||
|
||||
@template (FUNCTIONS, METHODS, MACROS) =
|
||||
|
@ -18,112 +13,8 @@ export update!, randomgardenevolution!, outputgarden
|
|||
$(DOCSTRING)
|
||||
"""
|
||||
|
||||
"""
Node of the classification tree.

A node stores its `type` and `name` (both case-folded and stripped of
diacritics), a free-text `bio`, its `children`, and a link to its `parent`.
The constructor never assigns `parent` on the node it builds; the field is only
set on nodes that end up as a child of another node.
"""
mutable struct Classification
    type::Symbol
    name::Symbol
    bio::String
    children::Vector{Classification}
    parent::Classification

    # Build a node, and recursively its whole subtree, from a parsed dictionary
    # with keys "type" and "name" and optional keys "bio" and "children".
    function Classification(classif::Dict{String, Any})
        subtrees = Classification[Classification(entry) for entry in get(classif, "children", [])]
        normalised(key) = Symbol(Unicode.normalize(classif[key], casefold=true, stripmark=true))
        node = new(normalised("type"), normalised("name"), get(classif, "bio", ""), subtrees)
        # Incomplete initialisation: `node.parent` stays unset, the children
        # are linked back to the node that has just been created.
        foreach(subtrees) do subtree
            subtree.parent = node
        end
        return node
    end
end
|
||||
|
||||
"Parse `data/classification.json` and build the corresponding `Classification` tree."
function loadclassification()
    clf = Classification(JSON.parsefile("data/classification.json"))
    @debug "loaded classification of type $(clf.type)"
    return clf
end
|
||||
|
||||
"Read `data/plants.csv` and return it with the `name` column converted to `Symbol`s."
function loadplants()::DataFrame
    plants = CSV.read("data/plants.csv")
    nplants = size(plants, 1)
    @info "loaded $(nplants) plants"
    plants.name = map(Symbol, plants.name)
    return plants
end
|
||||
|
||||
"""
Read `data/garden.csv` and return `(garden, mask)`.

`garden` holds, for every cell, the index of the cell's text in `plants`, or 0
when the text is not one of `plants`. `mask` is `true` exactly where the cell
text is "empty". Missing cells are treated as empty strings.
"""
function loadgarden(plants::Vector{String})::Tuple{Matrix{Int}, Matrix{Bool}}
    cells = coalesce.(CSV.read("data/garden.csv"), "")
    mask = convert(Matrix, cells .== "empty")
    positions = indexin(convert(Matrix, cells), plants)
    # `indexin` yields `nothing` for cells that match no plant; encode those as 0.
    garden = replace(positions, nothing=>0)
    @assert size(garden) == size(mask)
    @info "loaded garden of size $(size(garden))"
    return garden, mask
end
|
||||
|
||||
"""
Read the association table from `data/associations.csv`.

Column names are case-folded and stripped of diacritics, and a `name` column
labelling each row with the column names from the second one onward is set.
Missing entries are kept as `missing` (they are not replaced by a default).
"""
function loadcosts()::DataFrame
    df = CSV.read("data/associations.csv", copycols=true)
    colnames = map(names(df)) do column
        Symbol(Unicode.normalize(String(column), casefold=true, stripmark=true))
    end
    rename!(df, colnames)
    # NOTE(review): assumes the table has one row per column after the first,
    # in the same order - confirm against the data file.
    df.name = colnames[2:end]
    @info "loaded cost matrix for $(size(df, 1)) plants"
    return df
end
|
||||
|
||||
# function loadcosts()::Matrix{Float64}
|
||||
# df = CSV.read("data/costs.csv")
|
||||
# df = coalesce.(df, 0) # replace missing values by 0
|
||||
# costs = convert(Matrix, df[:, 2:end])
|
||||
# @info "loaded cost matrix of size $(size(costs))"
|
||||
# # ensure the matrix is symmetric: keep the max of itself and its transpose
|
||||
# costs = Float64.(max.(costs, permutedims(costs)))
|
||||
# end
|
||||
|
||||
"""
Return the parent of the node called `name` in the tree rooted at
`classification`, searching depth-first.

Returns `nothing` when no node carries that name, and also when the matching
node has no parent assigned (the root of the tree).
"""
function getparent(name::Symbol, classification::Classification)
    if classification.name == name
        # The `parent` field is never assigned on a root node; reading it would
        # throw an UndefRefError, so report "no parent" explicitly instead.
        return isdefined(classification, :parent) ? classification.parent : nothing
    end
    for child in classification.children
        parent = getparent(name, child)
        isnothing(parent) || return parent
    end
    return nothing
end
|
||||
|
||||
# Direct table lookup. Returns `missing` when either name is not a column of
# `costs`, when no row is labelled `plant1`, or when the stored entry is missing.
function _directcost(costs::DataFrame, plant1::Symbol, plant2::Symbol)
    # `propertynames` yields Symbols; `names` returns Strings in recent
    # DataFrames releases, which would make a Symbol membership test always fail.
    columns = propertynames(costs)
    if !(plant1 in columns && plant2 in columns)
        @debug "$plant1 and $plant2 not in costs"
        return missing
    end
    matches = costs[costs.name .== plant1, plant2]
    return isempty(matches) ? missing : first(matches)
end

# Name of the parent of `name` in the classification, or `nothing` when `name`
# is unknown or has no parent.
function _parentname(name::Symbol, classification::Classification)
    parent = try
        getparent(name, classification)
    catch err
        # Only the "root has no parent" case is expected here; anything else
        # is a real error and must not be swallowed.
        err isa UndefRefError || rethrow()
        nothing
    end
    return isnothing(parent) ? nothing : parent.name
end

# Cost of the pair, falling back first on the parent of `plant1`, then on the
# parent of `plant2` (recursively). Returns `missing` when nothing is known.
function _inheritedcost(costs::DataFrame, plant1::Symbol, plant2::Symbol, classification::Classification)
    @debug "$plant1 and $plant2"
    cost = _directcost(costs, plant1, plant2)
    ismissing(cost) || return cost

    @debug "missing"
    parent1 = _parentname(plant1, classification)
    if !isnothing(parent1)
        cost = _inheritedcost(costs, parent1, plant2, classification)
        ismissing(cost) || return cost
    end

    parent2 = _parentname(plant2, classification)
    isnothing(parent2) && return missing
    return _inheritedcost(costs, plant1, parent2, classification)
end

"""
Return the association cost between `plant1` and `plant2`.

The cost is read from `costs` (row labelled `plant1` in the `name` column,
column `plant2`). When that entry is unavailable, the lookup is retried with
the parent of `plant1`, then with the parent of `plant2`, walking up the
`classification`. When no entry is found at all, `0.0` is returned, because the
declared `Float64` return type cannot represent `missing`.
"""
function computecost(costs::DataFrame, plant1::Symbol, plant2::Symbol, classification::Classification)::Float64
    cost = _inheritedcost(costs, plant1, plant2, classification)
    return ismissing(cost) ? 0.0 : cost
end
|
||||
include("classification.jl")
|
||||
include("loaddata.jl")
|
||||
|
||||
"Return a random index to be filled from the garden mask."
|
||||
function randomindex(mask::Matrix{Bool})::Int
|
||||
|
|
50
src/classification.jl
Normal file
50
src/classification.jl
Normal file
|
@ -0,0 +1,50 @@
|
|||
using Unicode
|
||||
|
||||
"""
Node of the classification tree.

A node stores its `type` and `name` (both case-folded and stripped of
diacritics), a free-text `bio`, its `children`, and a link to its `parent`.
The constructor never assigns `parent` on the node it builds; the field is only
set on nodes that end up as a child of another node.
"""
mutable struct Classification
    type::Symbol
    name::Symbol
    bio::String
    children::Vector{Classification}
    parent::Classification

    # Build a node, and recursively its whole subtree, from a parsed dictionary
    # with keys "type" and "name" and optional keys "bio" and "children".
    function Classification(classif::Dict{String, Any})
        subtrees = Classification[Classification(entry) for entry in get(classif, "children", [])]
        normalised(key) = Symbol(Unicode.normalize(classif[key], casefold=true, stripmark=true))
        node = new(normalised("type"), normalised("name"), get(classif, "bio", ""), subtrees)
        # Incomplete initialisation: `node.parent` stays unset, the children
        # are linked back to the node that has just been created.
        foreach(subtrees) do subtree
            subtree.parent = node
        end
        return node
    end
end
|
||||
|
||||
"""
Write a one-line summary of `clf` to `io`: its type, its name, its bio in
parentheses when non-empty, and the number of direct children.
"""
function Base.show(io::IO, clf::Classification)
    nchildren = length(clf.children)
    # Wording kept as before: "child" below two children, "children" otherwise.
    childrentext = nchildren < 2 ? " with $(nchildren) child" : " with $(nchildren) children"
    biotext = isempty(clf.bio) ? "" : " ($(clf.bio))"
    # Write to the stream that was passed in rather than to stdout, so that
    # `sprint`, `repr`, string interpolation and loggers capture the text.
    print(io, "Classification(", clf.type, " ", clf.name, biotext, childrentext, ")")
    return nothing
end
|
||||
|
||||
"""
Find the node called `name` in the tree rooted at `classification`
(depth-first) and return its highest ancestor located directly below the node
named `:god`; this is the node itself when its own parent is `:god`.

Returns `nothing` when no node carries that name. When the climb reaches a node
without a parent (the match is the root, or the root is not named `:god`), that
node is returned.
"""
function getfirstparent(name::Symbol, classification::Classification)
    if classification.name == name
        ancestor = classification
        # `parent` is left unassigned on the root node, so check that it is
        # defined before reading it; otherwise this would throw UndefRefError.
        while isdefined(ancestor, :parent) && ancestor.parent.name != :god
            ancestor = ancestor.parent
        end
        return ancestor
    end
    for child in classification.children
        found = getfirstparent(name, child)
        isnothing(found) || return found
    end
    return nothing
end
|
83
src/loaddata.jl
Normal file
83
src/loaddata.jl
Normal file
|
@ -0,0 +1,83 @@
|
|||
using Logging
|
||||
using Unicode
|
||||
|
||||
using DataFrames
|
||||
using CSV
|
||||
using JSON
|
||||
|
||||
"Read `data/plants.csv` and return it with the `name` column converted to `Symbol`s."
function loadplants()::DataFrame
    plants = CSV.read("data/plants.csv")
    nplants = size(plants, 1)
    @info "loaded $(nplants) plants"
    plants.name = map(Symbol, plants.name)
    return plants
end
|
||||
|
||||
"""
Read `data/garden.csv` and return `(garden, mask)`.

`mask` is `true` exactly where the raw cell text is "empty" (compared before
any normalisation). `garden` holds, for every cell, the index in `plants` of
the case-folded, diacritic-stripped cell text, or 0 when it matches no plant.
Missing cells are treated as empty strings.
"""
function loadgarden(plants::Vector{Symbol})::Tuple{Matrix{Int}, Matrix{Bool}}
    cells = coalesce.(CSV.read("data/garden.csv"), "")
    mask = convert(Matrix, cells .== "empty")
    normalised = Unicode.normalize.(cells, casefold=true, stripmark=true)
    positions = indexin(convert(Matrix, normalised), String.(plants))
    # `indexin` yields `nothing` for cells that match no plant; encode those as 0.
    garden = replace(positions, nothing=>0)
    @assert size(garden) == size(mask)
    @info "loaded garden of size $(size(garden))"
    return garden, mask
end
|
||||
|
||||
"Parse `data/classification.json` and build the corresponding `Classification` tree."
function loadclassification()::Classification
    clf = Classification(JSON.parsefile("data/classification.json"))
    @debug "loaded classification of type $(clf.type)"
    return clf
end
|
||||
|
||||
"""
Read the association table from `data/associations.csv`.

Column names are case-folded and stripped of diacritics, and a `name` column
labelling each row with the column names from the second one onward is set.
Missing entries are kept as `missing` (they are not replaced by a default).
"""
function loadcostsdf()::DataFrame
    df = CSV.read("data/associations.csv", copycols=true)
    colnames = map(names(df)) do column
        Symbol(Unicode.normalize(String(column), casefold=true, stripmark=true))
    end
    rename!(df, colnames)
    # NOTE(review): assumes the table has one row per column after the first,
    # in the same order - confirm against the data file.
    df.name = colnames[2:end]
    @info "loaded cost matrix for $(size(df, 1)) plants"
    return df
end
|
||||
|
||||
# Look up the entry stored for the row labelled `rowname` (in the `name`
# column) and the column `colname`. Returns `missing` when either name is not a
# column of `costs_df`, when no row carries that label, or when the entry
# itself is missing.
function _lookupcost(costs_df::DataFrame, rowname::Symbol, colname::Symbol)
    # `propertynames` yields Symbols; `names` returns Strings in recent
    # DataFrames releases, which would make a Symbol membership test always fail.
    columns = propertynames(costs_df)
    (rowname in columns && colname in columns) || return missing
    matches = costs_df[costs_df.name .== rowname, colname]
    # An empty selection means no row is labelled `rowname`; indexing it with
    # `[1]` would throw a BoundsError.
    return isempty(matches) ? missing : first(matches)
end

"""
Return the association cost between `plant1` and `plant2`.

The entry for the two plants is read from `costs_df`. When it is unavailable,
the lookup is retried once with the ancestors returned by `getfirstparent` for
each plant. `0.0` is returned when either plant is absent from the
`classification` or when the ancestor lookup gives nothing either.
"""
function computecost(plant1::Symbol, plant2::Symbol, costs_df::DataFrame, classification::Classification)::Float64
    @debug "computecost($plant1, $plant2)"
    cost = _lookupcost(costs_df, plant1, plant2)
    ismissing(cost) || return cost

    parent1 = getfirstparent(plant1, classification)
    parent2 = getfirstparent(plant2, classification)
    if isnothing(parent1) || isnothing(parent2)
        return 0.0
    end

    @debug "computecost($(parent1.name), $(parent2.name))"
    cost = _lookupcost(costs_df, parent1.name, parent2.name)
    return ismissing(cost) ? 0.0 : cost
end
|
||||
|
||||
"""
Build the square matrix of pairwise costs: entry `(i, j)` is
`computecost(plants[i], plants[j], costs_df, classification)`.
"""
function costsmatrix(plants::Vector{Symbol}, costs_df::DataFrame, classification::Classification)::Matrix{Float64}
    n = length(plants)
    costs = Matrix{Float64}(undef, n, n)
    # Column by column, rows varying fastest, which is the order in which the
    # matrix is laid out in memory.
    for (j, plant2) in enumerate(plants), (i, plant1) in enumerate(plants)
        costs[i, j] = computecost(plant1, plant2, costs_df, classification)
    end
    return costs
end
|
||||
|
||||
"""
Load the plants, the classification and the association table from disk, in
that order, and return the matrix of pairwise costs between all plants.
"""
function loadcosts()
    plantnames = loadplants().name
    tree = loadclassification()
    associations = loadcostsdf()
    return costsmatrix(plantnames, associations, tree)
end
|
Loading…
Add table
Add a link
Reference in a new issue