Compute costs matrix from phylogenetic tree and reorganise sources

This commit is contained in:
Dimitri Lozeve 2020-02-22 16:41:25 +01:00
parent c117b9cccb
commit 34d9cd000d
5 changed files with 136 additions and 215 deletions

View file

@ -1,15 +1,10 @@
module GardenOptim
using Logging
using Unicode
using DataFrames
using DocStringExtensions
using CSV
using JSON
using Tables
export loadclassification, loadplants, loadgarden, loadcosts
export loadplants, loadgarden, loadclassification, loadcosts
export update!, randomgardenevolution!, outputgarden
@template (FUNCTIONS, METHODS, MACROS) =
@ -18,112 +13,8 @@ export update!, randomgardenevolution!, outputgarden
$(DOCSTRING)
"""
"""
    Classification(classif::Dict{String, Any})

Node of a classification tree, built recursively from a parsed dictionary.

The `"type"` and `"name"` entries are case-folded, stripped of diacritics and
stored as symbols. `"bio"` defaults to the empty string and `"children"` to no
children. Every child gets its `parent` field set to the node being built; the
node itself is returned with `parent` left unassigned.
"""
mutable struct Classification
    type::Symbol
    name::Symbol
    bio::String
    children::Vector{Classification}
    parent::Classification

    function Classification(classif::Dict{String, Any})
        # Normalise a label into a case- and accent-insensitive symbol.
        tosymbol(label) = Symbol(Unicode.normalize(label, casefold=true, stripmark=true))

        # Sub-trees are built first so they can be linked back to this node.
        subtrees = [Classification(entry) for entry in get(classif, "children", [])]
        node = new(
            tosymbol(classif["type"]),
            tosymbol(classif["name"]),
            get(classif, "bio", ""),
            subtrees,
        )
        foreach(subtrees) do subtree
            subtree.parent = node
        end
        return node
    end
end
"""
    loadclassification()

Parse `data/classification.json` and return the resulting `Classification`
tree. The type of the top-level node is logged at debug level.
"""
function loadclassification()
    parsed = JSON.parsefile("data/classification.json")
    clf = Classification(parsed)
    @debug "loaded classification of type $(clf.type)"
    return clf
end
"""
    loadplants()

Read `data/plants.csv` and return it as a `DataFrame` whose `name` column has
been converted to symbols. The number of rows read is logged at info level.
"""
function loadplants()::DataFrame
    plants = CSV.read("data/plants.csv")
    @info "loaded $(size(plants, 1)) plants"
    # Plant names are used as symbols by the rest of the code.
    plants.name = Symbol.(plants.name)
    return plants
end
"""
    loadgarden(plants::Vector{String})

Read `data/garden.csv` and return a `(garden, mask)` tuple.

`garden` holds, for every cell, the index of its content in `plants`, or `0`
when the content is not found there. `mask` is `true` for cells whose content
is exactly `"empty"`. Missing cells are treated as empty strings.
"""
function loadgarden(plants::Vector{String})::Tuple{Matrix{Int}, Matrix{Bool}}
    cells = coalesce.(CSV.read("data/garden.csv"), "")
    mask = convert(Matrix, cells .== "empty")
    # `indexin` yields `nothing` for unknown entries; encode those as 0.
    positions = indexin(convert(Matrix, cells), plants)
    garden = replace(positions, nothing => 0)
    @assert size(garden) == size(mask)
    @info "loaded garden of size $(size(garden))"
    return garden, mask
end
"""
    loadcosts()

Read `data/associations.csv` into a `DataFrame` with normalised column names.

Column names are case-folded, stripped of diacritics and converted to symbols.
The `name` column is then overwritten with the normalised names of every
column after the first, so rows and columns share the same labels (this
presumably expects a square table whose first column is `name`; confirm
against the data file). Missing entries are left as they are.
"""
function loadcosts()::DataFrame
    df = CSV.read("data/associations.csv", copycols=true)
    rawnames = String.(names(df))
    colnames = Symbol.(Unicode.normalize.(rawnames, casefold=true, stripmark=true))
    rename!(df, colnames)
    # Row labels mirror the column labels (the first column holds the labels).
    df.name = colnames[2:end]
    @info "loaded cost matrix for $(size(df, 1)) plants"
    return df
end
# function loadcosts()::Matrix{Float64}
# df = CSV.read("data/costs.csv")
# df = coalesce.(df, 0) # replace missing values by 0
# costs = convert(Matrix, df[:, 2:end])
# @info "loaded cost matrix of size $(size(costs))"
# # ensure the matrix is symmetric: keep the max of itself and its transpose
# costs = Float64.(max.(costs, permutedims(costs)))
# end
"""
    getparent(name::Symbol, classification::Classification)

Search the tree rooted at `classification` depth-first for the node called
`name` and return that node's `parent`. Return `nothing` when no node matches.

If the matching node has no parent assigned, reading the field throws an
`UndefRefError`.
"""
function getparent(name::Symbol, classification::Classification)
    classification.name == name && return classification.parent
    for child in classification.children
        found = getparent(name, child)
        found === nothing || return found
    end
    return nothing
end
"""
    computecost(costs::DataFrame, plant1::Symbol, plant2::Symbol, classification::Classification)

Return the association cost between `plant1` and `plant2`.

The cost is read from `costs` (row selected by `costs.name`, column by name).
When no value is available, the lookup is retried with the parent of `plant1`
in `classification`, then with the parent of `plant2`, climbing the tree until
a value is found. If nothing is found the neutral cost `0.0` is returned.
"""
function computecost(costs::DataFrame, plant1::Symbol, plant2::Symbol, classification::Classification)::Float64
    # The declared return type is Float64, so a failed lookup must not leak
    # `missing` to the caller (the conversion would throw a MethodError).
    return coalesce(_costormissing(costs, plant1, plant2, classification), 0.0)
end

# Name of the parent of `name` in the tree, or `nothing` if `name` is unknown
# or is a node without an assigned parent.
function _parentname(name::Symbol, classification::Classification)
    parent = try
        getparent(name, classification)
    catch err
        # Only the "parent field not assigned" case is expected here; any
        # other error is a real failure and must propagate.
        err isa UndefRefError || rethrow()
        nothing
    end
    return isnothing(parent) ? nothing : parent.name
end

# Recursive lookup returning the cost as a Float64, or `missing` if not found.
function _costormissing(costs::DataFrame, plant1::Symbol, plant2::Symbol, classification::Classification)
    @debug "$plant1 and $plant2"
    if plant1 in names(costs) && plant2 in names(costs)
        cost = costs[costs.name .== plant1, plant2][1]
        ismissing(cost) || return Float64(cost)
    else
        @debug "$plant1 and $plant2 not in costs"
    end
    @debug "missing"

    # First try to generalise the first plant to its parent group...
    parent1 = _parentname(plant1, classification)
    if !isnothing(parent1)
        cost = _costormissing(costs, parent1, plant2, classification)
        ismissing(cost) || return cost
    end

    # ...and only then the second plant.
    parent2 = _parentname(plant2, classification)
    if !isnothing(parent2)
        return _costormissing(costs, plant1, parent2, classification)
    end
    return missing
end
include("classification.jl")
include("loaddata.jl")
"Return a random index to be filled from the garden mask."
function randomindex(mask::Matrix{Bool})::Int

50
src/classification.jl Normal file
View file

@ -0,0 +1,50 @@
using Unicode
"""
    Classification(classif::Dict{String, Any})

Node of a classification tree, built recursively from a parsed dictionary.

The `"type"` and `"name"` entries are case-folded, stripped of diacritics and
stored as symbols. `"bio"` defaults to the empty string and `"children"` to no
children. Every child gets its `parent` field set to the node being built; the
node itself is returned with `parent` left unassigned.
"""
mutable struct Classification
    type::Symbol
    name::Symbol
    bio::String
    children::Vector{Classification}
    parent::Classification

    function Classification(classif::Dict{String, Any})
        # Normalise a label into a case- and accent-insensitive symbol.
        tosymbol(label) = Symbol(Unicode.normalize(label, casefold=true, stripmark=true))

        # Sub-trees are built first so they can be linked back to this node.
        subtrees = [Classification(entry) for entry in get(classif, "children", [])]
        node = new(
            tosymbol(classif["type"]),
            tosymbol(classif["name"]),
            get(classif, "bio", ""),
            subtrees,
        )
        foreach(subtrees) do subtree
            subtree.parent = node
        end
        return node
    end
end
"""
    Base.show(io::IO, clf::Classification)

Write a one-line summary of `clf` to `io`: its type, its name, its `bio` text
in parentheses when non-empty, and its number of direct children.
"""
function Base.show(io::IO, clf::Classification)
    nchildren = length(clf.children)
    childrentext = if nchildren < 2
        " with $(nchildren) child"
    else
        " with $(nchildren) children"
    end
    biotext = clf.bio != "" ? " ($(clf.bio))" : ""
    # Write to the stream we were given, not to stdout, so that `sprint`,
    # `repr`, string interpolation and IO redirection all work.
    print(io, "Classification(", clf.type, " ", clf.name, biotext, childrentext, ")")
end
"""
    getfirstparent(name::Symbol, classification::Classification)

Search the tree rooted at `classification` depth-first for the node called
`name` and return its highest ancestor located just below the node named
`:god`. The node itself is returned when it sits directly below `:god`.

Climbing also stops at any node without an assigned parent, so looking up the
top-level node, or using a tree whose top-level node is not named `:god`,
returns that top-most node instead of throwing an `UndefRefError`.

Return `nothing` when no node is called `name`.
"""
function getfirstparent(name::Symbol, classification::Classification)
    if classification.name == name
        parent = classification
        # The `parent` field may be unassigned (the constructor only links
        # children to their parent), so check it before dereferencing.
        while isdefined(parent, :parent) && parent.parent.name != :god
            parent = parent.parent
        end
        return parent
    end
    for child in classification.children
        parent = getfirstparent(name, child)
        isnothing(parent) || return parent
    end
    return nothing
end

83
src/loaddata.jl Normal file
View file

@ -0,0 +1,83 @@
using Logging
using Unicode
using DataFrames
using CSV
using JSON
"""
    loadplants()

Read `data/plants.csv` and return it as a `DataFrame` whose `name` column has
been converted to symbols. The number of rows read is logged at info level.
"""
function loadplants()::DataFrame
    plants = CSV.read("data/plants.csv")
    @info "loaded $(size(plants, 1)) plants"
    # Plant names are used as symbols by the rest of the code.
    plants.name = Symbol.(plants.name)
    return plants
end
"""
    loadgarden(plants::Vector{Symbol})

Read `data/garden.csv` and return a `(garden, mask)` tuple.

`mask` is `true` for cells whose raw content is exactly `"empty"`. `garden`
holds, for every cell, the index in `plants` of its case-folded, accent-free
content, or `0` when that content is not found there. Missing cells are
treated as empty strings.
"""
function loadgarden(plants::Vector{Symbol})::Tuple{Matrix{Int}, Matrix{Bool}}
    cells = coalesce.(CSV.read("data/garden.csv"), "")
    # The mask is computed on the raw content, before normalisation.
    mask = convert(Matrix, cells .== "empty")
    normalised = Unicode.normalize.(cells, casefold=true, stripmark=true)
    # `indexin` yields `nothing` for unknown entries; encode those as 0.
    positions = indexin(convert(Matrix, normalised), String.(plants))
    garden = replace(positions, nothing => 0)
    @assert size(garden) == size(mask)
    @info "loaded garden of size $(size(garden))"
    return garden, mask
end
"""
    loadclassification()

Parse `data/classification.json` and return the resulting `Classification`
tree. The type of the top-level node is logged at debug level.
"""
function loadclassification()::Classification
    parsed = JSON.parsefile("data/classification.json")
    clf = Classification(parsed)
    @debug "loaded classification of type $(clf.type)"
    return clf
end
"""
    loadcostsdf()

Read `data/associations.csv` into a `DataFrame` with normalised column names.

Column names are case-folded, stripped of diacritics and converted to symbols.
The `name` column is then overwritten with the normalised names of every
column after the first, so rows and columns share the same labels (this
presumably expects a square table whose first column is `name`; confirm
against the data file). Missing entries are left as they are.
"""
function loadcostsdf()::DataFrame
    df = CSV.read("data/associations.csv", copycols=true)
    rawnames = String.(names(df))
    colnames = Symbol.(Unicode.normalize.(rawnames, casefold=true, stripmark=true))
    rename!(df, colnames)
    # Row labels mirror the column labels (the first column holds the labels).
    df.name = colnames[2:end]
    @info "loaded cost matrix for $(size(df, 1)) plants"
    return df
end
"""
    computecost(plant1::Symbol, plant2::Symbol, costs_df::DataFrame, classification::Classification)

Return the association cost between `plant1` and `plant2`.

The value is first read directly from `costs_df` (row selected through the
`name` column, column selected by name). If it is absent or missing, the
lookup is repeated once with the groups returned by `getfirstparent` for each
plant. `0.0` is returned when either group cannot be found or when the second
lookup yields nothing either.
"""
function computecost(plant1::Symbol, plant2::Symbol, costs_df::DataFrame, classification::Classification)::Float64
    @debug "computecost($plant1, $plant2)"
    known = names(costs_df)

    # Stage 1: direct lookup for the two plants themselves.
    direct = if plant1 in known && plant2 in known
        costs_df[costs_df.name .== plant1, plant2][1]
    else
        missing
    end
    ismissing(direct) || return direct

    # Stage 2: fall back on the groups the plants belong to.
    parent1 = getfirstparent(plant1, classification)
    parent2 = getfirstparent(plant2, classification)
    (isnothing(parent1) || isnothing(parent2)) && return 0.0

    @debug "computecost($(parent1.name), $(parent2.name))"
    inherited = if parent1.name in known && parent2.name in known
        costs_df[costs_df.name .== parent1.name, parent2.name][1]
    else
        missing
    end
    return ismissing(inherited) ? 0.0 : inherited
end
"""
    costsmatrix(plants::Vector{Symbol}, costs_df::DataFrame, classification::Classification)

Build the square matrix whose entry `(i, j)` is
`computecost(plants[i], plants[j], costs_df, classification)`.
"""
function costsmatrix(plants::Vector{Symbol}, costs_df::DataFrame, classification::Classification)::Matrix{Float64}
    n = length(plants)
    costs = Matrix{Float64}(undef, n, n)
    # Column-major fill: the row index varies fastest, as in the equivalent
    # two-iterator comprehension.
    for (j, plant2) in enumerate(plants), (i, plant1) in enumerate(plants)
        costs[i, j] = computecost(plant1, plant2, costs_df, classification)
    end
    return costs
end
"""
    loadcosts()

Load the plants, the classification tree and the associations table from
disk, in that order, and return the costs matrix computed by `costsmatrix`
for the loaded plant names.
"""
function loadcosts()
    plantnames = loadplants().name
    tree = loadclassification()
    associations = loadcostsdf()
    return costsmatrix(plantnames, associations, tree)
end