Dissertation: final update

This commit is contained in:
Dimitri Lozeve 2018-09-10 11:27:11 +02:00
parent baec57c51d
commit aedd94d4af
30 changed files with 17967 additions and 243 deletions

77
dissertation/Other.bib Normal file
View file

@ -0,0 +1,77 @@
@book{hastie_elements_2009,
title = {The Elements of Statistical Learning},
volume = {1},
isbn = {978-0-387-84857-0},
url = {http://www.springerlink.com/index/10.1007/b94608},
abstract = {During the past decade there has been an explosion in computation and information technology. With it has come vast amounts of data in a variety of fields such as medicine, biology, finance, and marketing. The challenge of understanding these data has led to the development of new tools in the field of statistics, and spawned new areas such as data mining, machine learning, and bioinformatics. Many of these tools have common underpinnings but are often expressed with different terminology. This book describes the important ideas in these areas in a common conceptual framework. While the approach is statistical, the emphasis is on concepts rather than mathematics. Many examples are given, with a liberal use of color graphics. It should be a valuable resource for statisticians and anyone interested in data mining in science or industry. The book's coverage is broad, from supervised learning (prediction) to unsupervised learning. The many topics include neural networks, support vector machines, classification trees and boosting-the first comprehensive treatment of this topic in any book. Trevor Hastie, Robert Tibshirani, and Jerome Friedman are professors of statistics at Stanford University. They are prominent researchers in this area: Hastie and Tibshirani developed generalized additive models and wrote a popular book of that title. Hastie wrote much of the statistical modeling software in S-{PLUS} and invented principal curves and surfaces. Tibshirani proposed the Lasso and is co-author of the very successful An Introduction to the Bootstrap. Friedman is the co-inventor of many data-mining tools including {CART}, {MARS}, and projection pursuit. {FROM} {THE} {REVIEWS}: {TECHNOMETRICS} "This is a vast and complex book. Generally, it concentrates on explaining why and how the methods work, rather than how to use them. 
Examples and especially the visualizations are principle features...As a source for the methods of statistical learning...it will probably be a long time before there is a competitor to this book."},
pagetotal = {1694},
author = {Hastie, Trevor and Tibshirani, Robert and Friedman, Jerome},
date = {2009},
doi = {10.1007/b94608},
pmid = {12377617},
file = {Attachment:/home/dimitri/Zotero/storage/AVAIRYAL/Hastie, Tibshirani, Friedman - Unknown - Springer Series in Statistics The Elements of Statistical Learning The Elements of Statistical.pdf:application/pdf}
}
@incollection{karp_reducibility_2010,
title = {Reducibility among combinatorial problems},
isbn = {978-3-540-68274-5},
url = {http://www.springerlink.com/index/10.1007/978-1-4684-2001-2_9%5Cnpapers3://publication/doi/10.1007/978-1-4684-2001-2_9},
abstract = {A large class of computational problems involve the determination of properties of graphs, digraphs, integers, arrays of integers, finite families of finite sets, boolean formulas and elements of other countable domains. Through simple encodings from such domains into the set of words over a finite alphabet these problems can be converted into language recognition problems, and we can inquire into their computational complexity. It is reasonable to consider such a problem satisfactorily solved when an algorithm for its solution is found which terminates within a number of steps bounded by a polynomial in the length of the input. We show that a large number of classic unsolved problems of covering, matching, packing, routing, assignment and sequencing are equivalent, in the sense that either each of them possesses a polynomial-bounded algorithm or none of them does.},
pages = {219--241},
number = {Chapter 9},
booktitle = {50 Years of Integer Programming 1958-2008: From the Early Years to the State-of-the-Art},
author = {Karp, Richard M.},
date = {2010},
doi = {10.1007/978-3-540-68279-0_8},
pmid = {15890271},
file = {Attachment:/home/dimitri/Zotero/storage/4IAHTPB5/Karp - 1972 - Reducibility among Combinatorial Problems BT - (null).pdf:application/pdf}
}
@article{tomita_worst-case_2006,
title = {The worst-case time complexity for generating all maximal cliques and computational experiments},
volume = {363},
issn = {0304-3975},
url = {http://www.sciencedirect.com/science/article/pii/S0304397506003586},
doi = {10.1016/j.tcs.2006.06.015},
series = {Computing and Combinatorics},
abstract = {We present a depth-first search algorithm for generating all maximal cliques of an undirected graph, in which pruning methods are employed as in the Bron–Kerbosch algorithm. All the maximal cliques generated are output in a tree-like form. Subsequently, we prove that its worst-case time complexity is O(3^{n/3}) for an n-vertex graph. This is optimal as a function of n, since there exist up to 3^{n/3} maximal cliques in an n-vertex graph. The algorithm is also demonstrated to run very fast in practice by computational experiments.},
pages = {28--42},
number = {1},
journaltitle = {Theoretical Computer Science},
shortjournal = {Theoretical Computer Science},
author = {Tomita, Etsuji and Tanaka, Akira and Takahashi, Haruhisa},
urldate = {2018-07-31},
date = {2006-10-25},
keywords = {Computational experiments, Enumeration, Maximal cliques, Worst-case time complexity},
file = {ScienceDirect Full Text PDF:/home/dimitri/Zotero/storage/QDLTAXHX/Tomita et al. - 2006 - The worst-case time complexity for generating all .pdf:application/pdf;ScienceDirect Snapshot:/home/dimitri/Zotero/storage/TCJ8J7MV/S0304397506003586.html:text/html}
}
@article{pedregosa_scikit-learn:_2011,
title = {Scikit-learn: Machine Learning in Python},
volume = {12},
issn = {1533-7928},
url = {http://jmlr.csail.mit.edu/papers/v12/pedregosa11a.html},
shorttitle = {Scikit-learn},
abstract = {Scikit-learn is a Python module integrating a wide range of state-of-the-art machine learning algorithms for medium-scale supervised and unsupervised problems. This package focuses on bringing machine learning to non-specialists using a general-purpose high-level language. Emphasis is put on ease of use, performance, documentation, and {API} consistency. It has minimal dependencies and is distributed under the simplified {BSD} license, encouraging its use in both academic and commercial settings. Source code, binaries, and documentation can be downloaded from http://scikit-learn.sourceforge.net.},
pages = {2825--2830},
journaltitle = {Journal of Machine Learning Research},
author = {Pedregosa, Fabian and Varoquaux, Gaël and Gramfort, Alexandre and Michel, Vincent and Thirion, Bertrand and Grisel, Olivier and Blondel, Mathieu and Prettenhofer, Peter and Weiss, Ron and Dubourg, Vincent and Vanderplas, Jake and Passos, Alexandre and Cournapeau, David and Brucher, Matthieu and Perrot, Matthieu and Duchesnay, Édouard},
urldate = {2018-09-02},
date = {2011-10},
file = {Fulltext PDF:/home/dimitri/Zotero/storage/6SAE9PPD/Pedregosa et al. - 2011 - Scikit-learn Machine Learning in Python.pdf:application/pdf}
}
@misc{sejdinovic_advanced_2018,
title = {Advanced Topics in Statistical Machine Learning},
url = {http://www.stats.ox.ac.uk/%7Esejdinov/atsml/},
author = {Sejdinovic, Dino},
date = {2018-02-10}
}
@misc{golse_mat321_2015,
title = {{MAT}321 Analyse réelle},
publisher = {École polytechnique},
author = {Golse, François and Laszlo, Yves and Pacard, Frank and Viterbo, Claude},
date = {2015}
}

View file

@ -738,18 +738,6 @@ novel application of the discriminatory power of {PIs}.},
file = {Fulltext PDF:/home/dimitri/Zotero/storage/EUWNMLQF/Adams et al. - 2017 - Persistence Images A Stable Vector Representation.pdf:application/pdf} file = {Fulltext PDF:/home/dimitri/Zotero/storage/EUWNMLQF/Adams et al. - 2017 - Persistence Images A Stable Vector Representation.pdf:application/pdf}
} }
@article{bubenik_statistical_2015-1,
title = {Statistical Topological Data Analysis using Persistence Landscapes},
volume = {16},
url = {http://www.jmlr.org/papers/v16/bubenik15a.html},
pages = {77--102},
journaltitle = {Journal of Machine Learning Research},
author = {Bubenik, Peter},
urldate = {2018-06-12},
date = {2015},
file = {Full Text PDF:/home/dimitri/Zotero/storage/CJW9F5XG/Bubenik - 2015 - Statistical Topological Data Analysis using Persis.pdf:application/pdf;Snapshot:/home/dimitri/Zotero/storage/E2DN26NP/bubenik15a.html:text/html}
}
@article{kalisnik_tropical_2018, @article{kalisnik_tropical_2018,
title = {Tropical Coordinates on the Space of Persistence Barcodes}, title = {Tropical Coordinates on the Space of Persistence Barcodes},
issn = {1615-3375, 1615-3383}, issn = {1615-3375, 1615-3383},
@ -874,3 +862,186 @@ novel application of the discriminatory power of {PIs}.},
date = {2010}, date = {2010},
keywords = {computational topology, simplicial set, vietoris-rips complex, witness complex} keywords = {computational topology, simplicial set, vietoris-rips complex, witness complex}
} }
@inproceedings{zeppelzauer_topological_2016,
title = {Topological Descriptors for 3D Surface Analysis},
isbn = {978-3-319-39440-4 978-3-319-39441-1},
url = {https://link.springer.com/chapter/10.1007/978-3-319-39441-1_8},
doi = {10.1007/978-3-319-39441-1_8},
series = {Lecture Notes in Computer Science},
abstract = {We investigate topological descriptors for 3D surface analysis, i.e. the classification of surfaces according to their geometric fine structure. On a dataset of high-resolution 3D surface reconstructions we compute persistence diagrams for a 2D cubical filtration. In the next step we investigate different topological descriptors and measure their ability to discriminate structurally different 3D surface patches. We evaluate their sensitivity to different parameters and compare the performance of the resulting topological descriptors to alternative (non-topological) descriptors. We present a comprehensive evaluation that shows that topological descriptors are (i) robust, (ii) yield state-of-the-art performance for the task of 3D surface analysis and (iii) improve classification performance when combined with non-topological descriptors.},
eventtitle = {International Workshop on Computational Topology in Image Context},
pages = {77--87},
booktitle = {Computational Topology in Image Context},
publisher = {Springer, Cham},
author = {Zeppelzauer, Matthias and Zieliński, Bartosz and Juda, Mateusz and Seidl, Markus},
urldate = {2018-08-16},
date = {2016-06-15},
langid = {english},
file = {Snapshot:/home/dimitri/Zotero/storage/JH8QTE5R/978-3-319-39441-1_8.html:text/html}
}
@article{muandet_kernel_2017,
title = {Kernel Mean Embedding of Distributions: A Review and Beyond},
volume = {10},
issn = {1935-8237, 1935-8245},
url = {https://www.nowpublishers.com/article/Details/MAL-060},
doi = {10.1561/2200000060},
shorttitle = {Kernel Mean Embedding of Distributions},
abstract = {Kernel Mean Embedding of Distributions: A Review and Beyond},
pages = {1--141},
number = {1},
journaltitle = {Foundations and Trends® in Machine Learning},
shortjournal = {{MAL}},
author = {Muandet, Krikamol and Fukumizu, Kenji and Sriperumbudur, Bharath and Schölkopf, Bernhard},
urldate = {2018-08-30},
date = {2017-06-28},
file = {Full Text PDF:/home/dimitri/Zotero/storage/87JN65NM/Muandet et al. - 2017 - Kernel Mean Embedding of Distributions A Review a.pdf:application/pdf;Snapshot:/home/dimitri/Zotero/storage/7DS27M8D/MAL-060.html:text/html}
}
@book{berlinet_reproducing_2011,
title = {Reproducing Kernel Hilbert Spaces in Probability and Statistics},
isbn = {978-1-4419-9096-9},
abstract = {The reproducing kernel Hilbert space construction is a bijection or transform theory which associates a positive definite kernel (gaussian processes) with a Hilbert space offunctions. Like all transform theories (think Fourier), problems in one space may become transparent in the other, and optimal solutions in one space are often usefully optimal in the other. The theory was born in complex function theory, abstracted and then accidently injected into Statistics; Manny Parzen as a graduate student at Berkeley was given a strip of paper containing his qualifying exam problem- It read "reproducing kernel Hilbert space"- In the 1950's this was a truly obscure topic. Parzen tracked it down and internalized the subject. Soon after, he applied it to problems with the following fla vor: consider estimating the mean functions of a gaussian process. The mean functions which cannot be distinguished with probability one are precisely the functions in the Hilbert space associated to the covariance kernel of the processes. Parzen's own lively account of his work on re producing kernels is charmingly told in his interview with H. Joseph Newton in Statistical Science, 17, 2002, p. 364-366. Parzen moved to Stanford and his infectious enthusiasm caught Jerry Sacks, Don Ylvisaker and Grace Wahba among others. Sacks and Ylvis aker applied the ideas to design problems such as the following. Sup pose ({XdO}},
pagetotal = {369},
publisher = {Springer Science \& Business Media},
author = {Berlinet, Alain and Thomas-Agnan, Christine},
date = {2011-06-28},
langid = {english},
note = {Google-Books-{ID}: {bX}3TBwAAQBAJ},
keywords = {Business \& Economics / Economics / General, Business \& Economics / Economics / Theory, Business \& Economics / General, Business \& Economics / Statistics, Mathematics / Probability \& Statistics / General}
}
@thesis{price-wright_topological_2015,
title = {A Topological Approach to Temporal Networks},
abstract = {“Temporal networks” are a mathematical tool to represent systems that change
over time. Research on temporal networks is very active, and limited theoreti-
cal work has been done to study them. One approach to is to construct a series
of static subgraphs called snapshots. Existing techniques attempt to find the
temporal structure of a network to inform its partitioning into snapshots. An
important goal of such methods is to uncover meaningful temporal structure
that corresponds to actual features of the underlying system.
We investigate existing methods used to partition temporal networks based
on different ways of identifying temporal structure. Such methods have never
previously been compared directly to each other, so we examine and evaluate
their performance side-by-side on a suite of random-graph ensembles. We
show that without prior knowledge about a network's temporal structure,
these existing methods have limitations producing meaningful partitions.
To tackle the problem of finding temporal structure in a network, we ap-
ply methods from computational topology. Such methods have begun to be
employed in the study of static networks and provide a summary of global
features in data sets. We use them here to track the topology of a network
over time and distinguish important temporal features from trivial ones. We
define two types of topological spaces derived from temporal networks and use
persistent homology to generate a temporal profile for a network. We then
present different ways to use this to understand a network's temporal struc-
ture with limited prior knowledge. We show that the methods we apply from
computational topology can distinguish temporal distributions and provide a
high-level summary of temporal structure. These combined can be used to
inform a meaningful network partitioning and a deeper understanding of a
temporal network itself.},
institution = {University of Oxford},
type = {{MSc} dissertation in Mathematics and Foundations of Computer Science},
author = {Price-Wright, Erin},
date = {2015},
file = {Price-Wrigt - 2015 - A Topological Approach to Temporal Networks.pdf:/home/dimitri/Zotero/storage/6YI5RC6K/Price-Wrigt - 2015 - A Topological Approach to Temporal Networks.pdf:application/pdf}
}
@inproceedings{edelsbrunner_topological_2000,
title = {Topological persistence and simplification},
doi = {10.1109/SFCS.2000.892133},
abstract = {We formalize a notion of topological simplification within the framework of a filtration, which is the history of a growing complex. We classify a topological change that happens during growth as either a feature or noise, depending on its life-time or persistence within the filtration. We give fast algorithms for completing persistence and experimental evidence for their speed and utility.},
eventtitle = {Proceedings 41st Annual Symposium on Foundations of Computer Science},
pages = {454--463},
booktitle = {Proceedings 41st Annual Symposium on Foundations of Computer Science},
author = {Edelsbrunner, H. and Letscher, D. and Zomorodian, A.},
date = {2000-11},
keywords = {Topology, History, computational topology, algorithm theory, alpha shapes, computational geometry, Computational geometry, Computer graphics, Computer science, Density functional theory, fast algorithms, filtration, Filtration, growing complex, homology groups, Mathematics, Noise shaping, Shape, topological change, topological persistence, topological simplification, topology},
file = {IEEE Xplore Abstract Record:/home/dimitri/Zotero/storage/5LPIWG5Z/892133.html:text/html}
}
@article{morozov_persistence_2005,
title = {Persistence algorithm takes cubic time in worst case},
abstract = {Given a sequence of N simplices, we consider the sequence of sets Ki consisting of the first i simplices, for 1 ≤ i ≤ N. We call the sequence of Ki a filtration if all the Ki are simplicial complexes. In this note, we describe a filtration of a simplicial complex of N simplices on which the algorithm Pair-Simplices of Edelsbrunner, Letscher and Zomorodian [1] performs Ω(N 3) operations. The existence of this filtration should be contrasted to the experimentally observed only slightly super-linear running time for filtrations that arise from applications. We describe the space as well as the ordering on the simplices. Let n = ⌊(N + 29)/7⌋, v = ⌊(n 1)/2⌋, and note that both n and v are in Ω(N). In our filtration, all vertices appear before all edges in the filtration, and all edges appear before all triangles. The indices that we assign to the simplices will be within their respective classes (e.g., edge labeled n will appear before the triangle labeled 1). Some edges will receive a negative index, which is done for simplicity to indicate that they appear before the edges with positive labels (see Figure 2). Figure 1 illustrates the construction of our space as well as the assignment of indices to the simplices. Starting with triangle {ABC}, we add v vertices inside the triangle in the following manner: we place the first vertex V1 near the middle of edge {AB}, the second vertex V2 near the middle of},
journaltitle = {{BioGeometry} News, Dept. Comput. Sci., Duke Univ},
author = {Morozov, Dmitriy},
date = {2005},
file = {Citeseer - Full Text PDF:/home/dimitri/Zotero/storage/I6HKHZQ5/Morozov - 2005 - Persistence algorithm takes cubic time in worst ca.pdf:application/pdf;Citeseer - Snapshot:/home/dimitri/Zotero/storage/WN8ZVZH9/summary.html:text/html}
}
@article{de_silva_persistent_2011,
title = {Persistent Cohomology and Circular Coordinates},
volume = {45},
issn = {1432-0444},
url = {https://doi.org/10.1007/s00454-011-9344-x},
doi = {10.1007/s00454-011-9344-x},
abstract = {Nonlinear dimensionality reduction ({NLDR}) algorithms such as Isomap, {LLE}, and Laplacian Eigenmaps address the problem of representing high-dimensional nonlinear data in terms of low-dimensional coordinates which represent the intrinsic structure of the data. This paradigm incorporates the assumption that real-valued coordinates provide a rich enough class of functions to represent the data faithfully and efficiently. On the other hand, there are simple structures which challenge this assumption: the circle, for example, is one-dimensional, but its faithful representation requires two real coordinates. In this work, we present a strategy for constructing circle-valued functions on a statistical data set. We develop a machinery of persistent cohomology to identify candidates for significant circle-structures in the data, and we use harmonic smoothing and integration to obtain the circle-valued coordinate functions themselves. We suggest that this enriched class of coordinate functions permits a precise {NLDR} analysis of a broader range of realistic data sets.},
pages = {737--759},
number = {4},
journaltitle = {Discrete \& Computational Geometry},
shortjournal = {Discrete Comput Geom},
author = {de Silva, Vin and Morozov, Dmitriy and Vejdemo-Johansson, Mikael},
urldate = {2018-09-05},
date = {2011-06-01},
langid = {english},
keywords = {Persistent homology, Computational topology, Dimensionality reduction, Persistent cohomology},
file = {Springer Full Text PDF:/home/dimitri/Zotero/storage/EX9L3F7F/de Silva et al. - 2011 - Persistent Cohomology and Circular Coordinates.pdf:application/pdf}
}
@article{de_silva_dualities_2011,
title = {Dualities in persistent (co)homology},
volume = {27},
issn = {0266-5611, 1361-6420},
url = {http://stacks.iop.org/0266-5611/27/i=12/a=124003?key=crossref.1f4b24ef80c9b1fc789ecdc6221097de},
doi = {10.1088/0266-5611/27/12/124003},
pages = {124003},
number = {12},
journaltitle = {Inverse Problems},
author = {de Silva, Vin and Morozov, Dmitriy and Vejdemo-Johansson, Mikael},
urldate = {2018-09-05},
date = {2011-12-01}
}
@incollection{mcgeoch_distributed_2014,
location = {Philadelphia, {PA}},
title = {Distributed Computation of Persistent Homology},
isbn = {978-1-61197-319-8},
url = {http://epubs.siam.org/doi/abs/10.1137/1.9781611973198.4},
pages = {31--38},
booktitle = {2014 Proceedings of the Sixteenth Workshop on Algorithm Engineering and Experiments ({ALENEX})},
publisher = {Society for Industrial and Applied Mathematics},
author = {Bauer, Ulrich and Kerber, Michael and Reininghaus, Jan},
editor = {{McGeoch}, Catherine C. and Meyer, Ulrich},
urldate = {2018-09-05},
date = {2014-05},
langid = {english},
doi = {10.1137/1.9781611973198.4}
}
@article{carlsson_zigzag_2008,
title = {Zigzag Persistence},
url = {http://arxiv.org/abs/0812.0197},
abstract = {We describe a new methodology for studying persistence of topological features across a family of spaces or point-cloud data sets, called zigzag persistence. Building on classical results about quiver representations, zigzag persistence generalises the highly successful theory of persistent homology and addresses several situations which are not covered by that theory. In this paper we develop theoretical and algorithmic foundations with a view towards applications in topological statistics.},
journaltitle = {{arXiv}:0812.0197 [cs]},
author = {Carlsson, Gunnar and de Silva, Vin},
urldate = {2018-09-08},
date = {2008-11-30},
eprinttype = {arxiv},
eprint = {0812.0197},
keywords = {Computer Science - Computational Geometry, I.3.5},
file = {arXiv\:0812.0197 PDF:/home/dimitri/Zotero/storage/PKSM89FF/Carlsson and de Silva - 2008 - Zigzag Persistence.pdf:application/pdf;arXiv.org Snapshot:/home/dimitri/Zotero/storage/QF37EI5F/0812.html:text/html}
}
@article{maria_computing_2016,
title = {Computing Zigzag Persistent Cohomology},
url = {http://arxiv.org/abs/1608.06039},
abstract = {Zigzag persistent homology is a powerful generalisation of persistent homology that allows one not only to compute persistence diagrams with less noise and using less memory, but also to use persistence in new fields of application. However, due to the increase in complexity of the algebraic treatment of the theory, most algorithmic results in the field have remained of theoretical nature. This article describes an efficient algorithm to compute zigzag persistence, emphasising on its practical interest. The algorithm is a zigzag persistent cohomology algorithm, based on the dualisation of reflections and transpositions transformations within the zigzag sequence. We provide an extensive experimental study of the algorithm. We study the algorithm along two directions. First, we compare its performance with zigzag persistent homology algorithm and show the interest of cohomology in zigzag persistence. Second, we illustrate the interest of zigzag persistence in topological data analysis by comparing it to state of the art methods in the field, specifically optimised algorithm for standard persistent homology and sparse filtrations. We compare the memory and time complexities of the different algorithms, as well as the quality of the output persistence diagrams.},
journaltitle = {{arXiv}:1608.06039 [cs]},
author = {Maria, Clément and Oudot, Steve},
urldate = {2018-09-08},
date = {2016-08-21},
eprinttype = {arxiv},
eprint = {1608.06039},
keywords = {Computer Science - Computational Geometry},
file = {arXiv\:1608.06039 PDF:/home/dimitri/Zotero/storage/LJBHWTMY/Maria and Oudot - 2016 - Computing Zigzag Persistent Cohomology.pdf:application/pdf;arXiv.org Snapshot:/home/dimitri/Zotero/storage/TCKJGZET/1608.html:text/html}
}

119
dissertation/clustering.py Normal file
View file

@ -0,0 +1,119 @@
#!/usr/bin/env python3
"""Cluster subnetworks by their persistence-based similarities/distances.

For each dataset (generative model, SocioPatterns), load the precomputed
sliced-Wasserstein Gram matrices and bottleneck distance matrices for the
zigzag and WRCF persistence diagrams, run average-linkage agglomerative
clustering on each matrix, and save a step plot of the cluster label
assigned to every subnetwork under fig/.
"""
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.svm import OneClassSVM
import dill
import matplotlib
matplotlib.use("PDF")
import matplotlib.pyplot as plt

plt.style.use("fivethirtyeight")
plt.rcParams["figure.figsize"] = (10, 6)

N_CLUSTERS = 10      # number of clusters for agglomerative clustering
GENERATIVE = True    # process the generative-model dataset
SOCIOPATTERNS = True # process the SocioPatterns dataset


def _load(directory, name):
    """Unpickle and return ``directory/name.dill``, closing the file handle."""
    with open("{}/{}.dill".format(directory, name), "rb") as f:
        return dill.load(f)


def _cluster_and_plot(matrix, outfile):
    """Cluster a precomputed matrix and save a step plot of the labels.

    :param matrix: square precomputed matrix handed to AgglomerativeClustering.
        NOTE(review): affinity='precomputed' is interpreted as a *distance*
        matrix by scikit-learn, but the kernel Gram matrices are similarities
        -- confirm this inversion is intended.
    :param outfile: path of the PDF figure to write.
    """
    clf = AgglomerativeClustering(
        n_clusters=N_CLUSTERS, affinity='precomputed', linkage='average')
    clf.fit(matrix)
    fig, ax = plt.subplots()
    ax.step(range(len(clf.labels_)), clf.labels_, where='post')
    ax.set_xlabel("Subnetwork")
    ax.set_ylabel("Cluster")
    fig.savefig(outfile, transparent=True,
                pad_inches=0.3, bbox_inches="tight")


def _run_dataset(directory, prefix):
    """Run the four clusterings of one dataset and save the figures.

    :param directory: directory holding the dataset's .dill files.
    :param prefix: prefix of the output figure filenames under fig/.
    """
    # The raw diagram lists are loaded for parity with the original pipeline,
    # but only the Gram/distance matrices are used below.
    zz_dgms = _load(directory, "zz_dgms")
    wrcf_dgms = _load(directory, "wrcf_dgms")
    zz_gram1 = _load(directory, "zz_gram1")
    wrcf_gram1 = _load(directory, "wrcf_gram1")
    zz_distmat = _load(directory, "zz_distmat")
    wrcf_distmat = _load(directory, "wrcf_distmat")
    print("Zigzag + kernel")
    _cluster_and_plot(zz_gram1, "fig/{}_zz_k.pdf".format(prefix))
    print("WRCF + kernel")
    _cluster_and_plot(wrcf_gram1, "fig/{}_wrcf_k.pdf".format(prefix))
    print("Zigzag + bottleneck")
    _cluster_and_plot(zz_distmat, "fig/{}_zz_b.pdf".format(prefix))
    print("WRCF + bottleneck")
    _cluster_and_plot(wrcf_distmat, "fig/{}_wrcf_b.pdf".format(prefix))


if __name__ == "__main__":
    if GENERATIVE:
        print("==== Generative model ====")
        _run_dataset("generative", "gen")
    if SOCIOPATTERNS:
        print("==== SocioPatterns dataset ====")
        _run_dataset("sociopatterns", "sp")

Binary file not shown.

File diff suppressed because it is too large Load diff

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 387 KiB

File diff suppressed because one or more lines are too long

Binary file not shown.

After

Width:  |  Height:  |  Size: 101 KiB

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.5 MiB

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

128
dissertation/generative.py Normal file
View file

@ -0,0 +1,128 @@
#!/usr/bin/env python3
import numpy as np
import igraph as ig
import dionysus as d
import multiprocessing
# from dask.distributed import Client
from zigzag import sliding_windows, zigzag_network
from wrcf import wrcf_diagram
from sliced_wasserstein import diagram_array, SW_approx
import dill
def random_edge_presences(T, f):
    """Draw random event times following a periodic (sinusoidal) density.

    A random number of distinct times (at most T//2 - 1) is sampled without
    replacement from {0, ..., T-1}, weighted by sin(f*t) + 1.

    :param T: time range: times are drawn from np.arange(T)
    :param f: frequency of the sinusoidal density
    :return: a sorted array of distinct times.
    """
    weights = np.sin(f * np.arange(T)) + 1
    weights = weights / weights.sum()
    n_samples = np.random.randint(T // 2)
    drawn = np.random.choice(np.arange(T), size=n_samples,
                             replace=False, p=weights)
    return np.sort(drawn)
def remove_inf(dgm):
    """Filter the points at infinity out of a persistence diagram.

    :param dgm: a dionysus Diagram
    :return: a new Diagram containing only the finite points of dgm.
    """
    finite = d.Diagram()
    for point in dgm:
        # keep only points whose death time is finite
        if point.death != np.inf:
            finite.append(point)
    return finite
## Global parameters
NODES = 40               # number of vertices of the base graph
EDGE_PROB = 0.9          # edge probability of the Erdos-Renyi base graph
TIME_RANGE = 200         # temporal span of the network
FREQ = 15/TIME_RANGE     # frequency of the periodic edge-activity density
N_WINDOWS = 20           # number of sliding windows

## Computations to perform
ZIGZAG_PERS = True
WRCF_PERS = True
SW_KERNEL = True
BOTTLENECK_DIST = True


def _dump(obj, path):
    """Serialise obj to path with dill, closing the file handle."""
    with open(path, "wb") as f:
        dill.dump(obj, f)


if __name__ == "__main__":
    print("Generating random temporal network...", end="", flush=True)
    # Base topology: each Erdos-Renyi edge gets random periodic presence times,
    # one temporal edge per presence time.
    basegraph = ig.Graph.Erdos_Renyi(NODES, EDGE_PROB)
    g = ig.Graph()
    g.add_vertices(len(basegraph.vs))
    for e in basegraph.es:
        times = random_edge_presences(TIME_RANGE, FREQ)
        for t in times:
            g.add_edge(e.source, e.target, time=t)
    print("done.")

    print("Temporal partitioning...", end="", flush=True)
    wins = sliding_windows(g, 1/N_WINDOWS)
    print("done.")

    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    if ZIGZAG_PERS:
        print("Zigzag persistence...", end="", flush=True)
        zz_dgms = pool.map(zigzag_network, wins)
        _dump(zz_dgms, "generative/zz_dgms.dill")
        print("done, saved.")
    if WRCF_PERS:
        print("WRCF...", end="", flush=True)
        ## Collapse each subnetwork into a static graph: the weight is the
        ## number of appearances of each edge
        for w in wins:
            w.es["time"] = np.repeat(1, len(w.es["time"]))
            w.simplify(combine_edges="sum")
            w.es["weight"] = w.es["time"]
            del w.es["time"]
        wrcf_dgms = pool.map(wrcf_diagram, wins)
        _dump(wrcf_dgms, "generative/wrcf_dgms.dill")
        print("done.")
    # All parallel work is finished: shut the pool down cleanly instead of
    # killing the workers with terminate().
    pool.close()
    pool.join()

    # Dimension-1 diagrams, shared by the kernel and bottleneck computations.
    # Bug fix: these were previously defined only inside the SW_KERNEL
    # branches, raising NameError when SW_KERNEL was disabled while
    # BOTTLENECK_DIST was enabled.
    if ZIGZAG_PERS:
        zz_dgms1 = [dgm[1] for dgm in zz_dgms if len(dgm) > 1]
    if WRCF_PERS:
        wrcf_dgms1 = [dgm[1] for dgm in wrcf_dgms if len(dgm) > 1]

    if ZIGZAG_PERS and SW_KERNEL:
        print("Sliced Wasserstein Kernel (zigzag)...", end="", flush=True)
        zz_gram1 = np.array([[SW_approx(zz_dgms1[i], zz_dgms1[j], 10)
                              for i in range(len(zz_dgms1))]
                             for j in range(len(zz_dgms1))])
        _dump(zz_gram1, "generative/zz_gram1.dill")
        print("done, saved.")
    if WRCF_PERS and SW_KERNEL:
        print("Sliced Wasserstein Kernel (WRCF)...", end="", flush=True)
        wrcf_gram1 = np.array([[SW_approx(wrcf_dgms1[i], wrcf_dgms1[j], 10)
                                for i in range(len(wrcf_dgms1))]
                               for j in range(len(wrcf_dgms1))])
        _dump(wrcf_gram1, "generative/wrcf_gram1.dill")
        print("done, saved.")
    if ZIGZAG_PERS and BOTTLENECK_DIST:
        print("Bottleneck distance (zigzag)...", end="", flush=True)
        # Bottleneck distance is only defined for finite points.
        zz_dgms1 = list(map(remove_inf, zz_dgms1))
        zz_distmat = np.array([[d.bottleneck_distance(zz_dgms1[i], zz_dgms1[j])
                                for i in range(len(zz_dgms1))]
                               for j in range(len(zz_dgms1))])
        _dump(zz_distmat, "generative/zz_distmat.dill")
        print("done, saved.")
    if WRCF_PERS and BOTTLENECK_DIST:
        print("Bottleneck distance (WRCF)...", end="", flush=True)
        wrcf_dgms1 = list(map(remove_inf, wrcf_dgms1))
        wrcf_distmat = np.array([[d.bottleneck_distance(wrcf_dgms1[i], wrcf_dgms1[j])
                                  for i in range(len(wrcf_dgms1))]
                                 for j in range(len(wrcf_dgms1))])
        _dump(wrcf_distmat, "generative/wrcf_distmat.dill")
        print("done, saved.")

View file

@ -1,8 +1,21 @@
\usepackage{fontspec} \usepackage{fontspec}
\setmainfont{Linux Libertine O} % \setmainfont{Linux Libertine O}
\setsansfont{Linux Biolinum O} % \setsansfont{Linux Biolinum O}
\setmonofont[Scale=0.83]{Inconsolata} % \setmonofont[Scale=0.83]{Inconsolata}
\setmainfont{Libertinus Serif}
\setsansfont{Libertinus Sans}
\setmonofont[Scale=0.95]{Inconsolata}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{amsthm}
\usepackage{unicode-math}
%\setmathfont{Libertinus Math}
\usepackage[ruled]{algorithm2e}
\newcommand*\diff{\mathop{}\!\mathrm{d}}
\usepackage{polyglossia} \usepackage{polyglossia}
\setdefaultlanguage[variant=british]{english} \setdefaultlanguage[variant=british]{english}
@ -10,26 +23,26 @@
\usepackage{graphicx} \usepackage{graphicx}
\usepackage[dvipsnames]{xcolor} \usepackage[dvipsnames]{xcolor}
\usepackage{wrapfig} \usepackage{wrapfig}
\usepackage{caption}
\usepackage{subcaption} \usepackage{subcaption}
\usepackage{lettrine} \usepackage{lettrine}
\usepackage{amssymb, amsmath} \usepackage{thmtools}
\usepackage{amsthm}
\theoremstyle{plain} \theoremstyle{plain}
\newtheorem{thm}{Theorem}[chapter] \newtheorem{thm}{Theorem}[chapter]
\newcommand{\thmautorefname}{theorem} %\newcommand{\thmautorefname}{theorem}
\newtheorem{lem}[thm]{Lemma} \newtheorem{lem}[thm]{Lemma}
\newcommand{\lemautorefname}{lemma} %\newcommand{\lemautorefname}{lemma}
\newtheorem{cor}[thm]{Corollary} \newtheorem{cor}[thm]{Corollary}
\newcommand{\corautorefname}{corollary} %\newcommand{\corautorefname}{corollary}
\newtheorem{prop}[thm]{Proposition} \newtheorem{prop}[thm]{Proposition}
\newcommand{\propautorefname}{proposition} %\newcommand{\propautorefname}{proposition}
\theoremstyle{definition} \theoremstyle{definition}
\newtheorem{defn}{Definition}[chapter] \newtheorem{defn}{Definition}[chapter]
\newcommand{\defnautorefname}{definition} %\newcommand{\defnautorefname}{definition}
\newtheorem{expl}{Example}[chapter] \newtheorem{expl}{Example}[chapter]
\newcommand{\explautorefname}{example} %\newcommand{\explautorefname}{example}
\theoremstyle{remark} \theoremstyle{remark}
\newtheorem*{rem}{Remark} \newtheorem*{rem}{Remark}
\newtheorem*{note}{Note} \newtheorem*{note}{Note}
@ -39,10 +52,11 @@
\usepackage{tikz-network} \usepackage{tikz-network}
\usepackage{tikz} \usepackage{tikz}
\usetikzlibrary{patterns,backgrounds,positioning,chains,lindenmayersystems} \usetikzlibrary{patterns,backgrounds,positioning,chains,lindenmayersystems,intersections}
\usepackage[style=numeric-comp,backref,url=false]{biblatex} \usepackage{csquotes}
\bibliography{TDA,temporalgraphs} \usepackage[backend=biber,style=numeric-comp,backref,url=false,minnames=3]{biblatex}
\bibliography{TDA,temporalgraphs,Other}
\usepackage{pdfpages} \usepackage{pdfpages}
@ -61,9 +75,12 @@
linkcolor=MidnightBlue, linkcolor=MidnightBlue,
filecolor=MidnightBlue, filecolor=MidnightBlue,
urlcolor=MidnightBlue, urlcolor=MidnightBlue,
citecolor=Green citecolor=MidnightBlue
} }
\usepackage{minted}
\setminted{autogobble,fontsize=\footnotesize,linenos,stepnumber=5}
%% Pour la classe memoir /!\ %% Pour la classe memoir /!\
%% Marges %% Marges
@ -72,12 +89,12 @@
\checkandfixthelayout% \checkandfixthelayout%
%% Numérotation des divisions logiques %% Numérotation des divisions logiques
\setsecnumdepth{subsection} \setsecnumdepth{subsubsection}
\maxsecnumdepth{subsection} \maxsecnumdepth{subsubsection}
%% Profondeur de la ToC %% Profondeur de la ToC
\settocdepth{subsection} \settocdepth{subsubsection}
\maxtocdepth{subsection} \maxtocdepth{subsubsection}
%% Style des titres des divisions logiques %% Style des titres des divisions logiques
\setsecheadstyle{\Large\scshape} \setsecheadstyle{\Large\scshape}
@ -95,6 +112,7 @@
%% Couleurs %% Couleurs
%\definecolor{purpletouch}{RGB}{103,30,117} %\definecolor{purpletouch}{RGB}{103,30,117}
\definecolor{bleux}{RGB}{0,62,92} \definecolor{bleux}{RGB}{0,62,92}
\definecolor{OxfordBlue}{RGB}{0,33,71}
\author{Dimitri Lozeve} \author{Dimitri Lozeve}
\date{September 2018} \date{September 2018}

View file

@ -0,0 +1,54 @@
import numpy as np
import dionysus as d
def diagram_array(dgm):
    """Convert a Dionysus diagram to a Numpy array.

    :param dgm: Dionysus Diagram
    :return: a Numpy array of [birth, death] pairs representing the
        finite points in the diagram (points at infinity are dropped).
    """
    return np.array([[p.birth, p.death] for p in dgm if p.death != np.inf])


def SW_approx(dgm1, dgm2, M):
    """Approximate computation of the Sliced Wasserstein kernel.

    Follows Carrière, Cuturi & Oudot (ICML 2017): each diagram is
    augmented with the orthogonal projection of the other diagram onto
    the diagonal, then both are projected onto M directions evenly
    spaced over [-pi/2, pi/2) and compared with the l1 distance.

    :param dgm1: first Diagram
    :param dgm2: second Diagram
    :param M: (int) number of directions
    :return: the approximate value of the Sliced Wasserstein distance of
        dgm1 and dgm2, sampled over M directions.
    """
    dgm1 = diagram_array(dgm1)
    dgm2 = diagram_array(dgm2)
    # Convention: the distance involving an empty diagram is zero.
    if dgm1.size == 0 or dgm2.size == 0:
        return 0
    # Add \pi_\Delta(dgm1) to dgm2 and vice-versa, so that both diagrams
    # have the same number of points.  The orthogonal projection of
    # (b, d) onto the diagonal is ((b+d)/2, (b+d)/2).
    # (Bug fix: this previously divided by sqrt(2), which scales the
    # projected points by sqrt(2) away from the true projection.)
    proj1 = dgm1.dot([1, 1]) / 2
    proj2 = dgm2.dot([1, 1]) / 2
    dgm1 = np.vstack((dgm1, np.vstack((proj2, proj2)).T))
    dgm2 = np.vstack((dgm2, np.vstack((proj1, proj1)).T))
    SW = 0
    theta = -np.pi/2
    s = np.pi/M
    for i in range(M):
        # Unit vector in the direction theta, i.e. of slope tan(theta).
        # (Bug fix: this previously used np.arctan, which samples a
        # wrong, much narrower set of directions.)
        vec = [1, np.tan(theta)]
        vec = vec / np.linalg.norm(vec)
        # Project each diagram on the direction theta
        V1 = dgm1.dot(vec)
        V2 = dgm2.dot(vec)
        # Sort the projections
        V1.sort()
        V2.sort()
        # l1-distance between the projections, weighted by the angular
        # sampling step s
        SW = SW + s * np.sum(np.abs(V1 - V2))
        theta = theta + s
    return 1/np.pi * SW

View file

@ -0,0 +1,106 @@
#!/usr/bin/env python3
import numpy as np
import igraph as ig
import dionysus as d
import multiprocessing
# from dask.distributed import Client
from zigzag import sliding_windows, zigzag_network
from wrcf import wrcf_diagram
from sliced_wasserstein import diagram_array, SW_approx
import dill
def remove_inf(dgm):
    """Remove infinite points in a persistence diagram.

    :param dgm: Diagram
    :return: a new Diagram containing only the finite points of dgm.
    """
    finite = d.Diagram()
    for point in dgm:
        if point.death == np.inf:
            continue
        finite.append(point)
    return finite
## Global parameters
# Number of sliding windows used to partition the temporal network.
N_WINDOWS = 40
## Computations
# Feature toggles: which persistence pipelines and pairwise comparisons
# to run below.
ZIGZAG_PERS = True
WRCF_PERS = True
SW_KERNEL = True
BOTTLENECK_DIST = True
if __name__ == "__main__":
    # Load the SocioPatterns "Infectious" face-to-face contact dataset as
    # a temporal network (edges carry a "time" attribute).
    print("Loading SocioPatterns dataset...", end="", flush=True)
    g = ig.read("data/sociopatterns/infectious/infectious.graphml")
    del g.es["id"]
    # print(g.summary())
    print("done.")

    # Partition the temporal network into N_WINDOWS successive subnetworks.
    print("Temporal partitioning...", end="", flush=True)
    wins = sliding_windows(g, 1/N_WINDOWS)
    print("done.")

    # One worker per core: the per-window computations are independent.
    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())

    if ZIGZAG_PERS:
        print("Zigzag persistence...", end="", flush=True)
        zz_dgms = pool.map(zigzag_network, wins)
        dill.dump(zz_dgms, open("sociopatterns/zz_dgms.dill", "wb"))
        # Keep only the 1-dimensional (H1) diagrams for the pairwise
        # comparisons below.  (Bug fix: this was previously computed in
        # the SW_KERNEL branch only, so running with SW_KERNEL disabled
        # and BOTTLENECK_DIST enabled raised a NameError.)
        zz_dgms1 = [dgm[1] for dgm in zz_dgms if len(dgm) > 1]
        print("done, saved.")

    if WRCF_PERS:
        print("WRCF...", end="", flush=True)
        ## Collapse each subnetwork into a static graph: the weight is the
        ## number of appearances of each edge
        for w in wins:
            w.es["time"] = np.repeat(1, len(w.es["time"]))
            w.simplify(combine_edges="sum")
            w.es["weight"] = w.es["time"]
            del w.es["time"]
        wrcf_dgms = pool.map(wrcf_diagram, wins)
        dill.dump(wrcf_dgms, open("sociopatterns/wrcf_dgms.dill", "wb"))
        # H1 diagrams only, same rationale (and same fix) as above.
        wrcf_dgms1 = [dgm[1] for dgm in wrcf_dgms if len(dgm) > 1]
        print("done.")

    pool.terminate()

    if ZIGZAG_PERS and SW_KERNEL:
        print("Sliced Wasserstein Kernel (zigzag)...", end="", flush=True)
        # Gram matrix of the Sliced Wasserstein kernel (10 directions).
        zz_gram1 = np.array([[SW_approx(zz_dgms1[i], zz_dgms1[j], 10)
                              for i in range(len(zz_dgms1))]
                             for j in range(len(zz_dgms1))])
        dill.dump(zz_gram1, open("sociopatterns/zz_gram1.dill", "wb"))
        print("done, saved.")

    if WRCF_PERS and SW_KERNEL:
        print("Sliced Wasserstein Kernel (WRCF)...", end="", flush=True)
        wrcf_gram1 = np.array([[SW_approx(wrcf_dgms1[i], wrcf_dgms1[j], 10)
                                for i in range(len(wrcf_dgms1))]
                               for j in range(len(wrcf_dgms1))])
        dill.dump(wrcf_gram1, open("sociopatterns/wrcf_gram1.dill", "wb"))
        print("done, saved.")

    if ZIGZAG_PERS and BOTTLENECK_DIST:
        print("Bottleneck distance (zigzag)...", end="", flush=True)
        # Bottleneck distance requires diagrams with finite points only.
        zz_dgms1 = list(map(remove_inf, zz_dgms1))
        zz_distmat = np.array([[d.bottleneck_distance(zz_dgms1[i], zz_dgms1[j])
                                for i in range(len(zz_dgms1))]
                               for j in range(len(zz_dgms1))])
        dill.dump(zz_distmat, open("sociopatterns/zz_distmat.dill", "wb"))
        print("done, saved.")

    if WRCF_PERS and BOTTLENECK_DIST:
        print("Bottleneck distance (WRCF)...", end="", flush=True)
        wrcf_dgms1 = list(map(remove_inf, wrcf_dgms1))
        wrcf_distmat = np.array([[d.bottleneck_distance(wrcf_dgms1[i], wrcf_dgms1[j])
                                  for i in range(len(wrcf_dgms1))]
                                 for j in range(len(wrcf_dgms1))])
        dill.dump(wrcf_distmat, open("sociopatterns/wrcf_distmat.dill", "wb"))
        print("done, saved.")

View file

@ -1,4 +1,36 @@
@article{girvan_community_2002,
title = {Community structure in social and biological networks},
volume = {99},
issn = {0027-8424},
url = {http://arxiv.org/abs/cond-mat/0112110},
doi = {10.1073/pnas.122653799},
abstract = {A number of recent studies have focused on the statistical properties of networked systems such as social networks and the Worldwide Web. Researchers have concentrated particularly on a few properties that seem to be common to many networks: the small-world property, power-law degree distributions, and network transitivity. In this article, we highlight another property that is found in many networks, the property of community structure, in which network nodes are joined together in tightly knit groups, between which there are only looser connections. We propose a method for detecting such communities, built around the idea of using centrality indices to find community boundaries. We test our method on computer-generated and real-world graphs whose community structure is already known and find that the method detects this known structure with high sensitivity and reliability. We also apply the method to two networks whose community structure is not well known-a collaboration network and a food web-and find that it detects significant and informative community divisions in both cases.},
pages = {7821--7826},
number = {12},
journaltitle = {Proceedings of the National Academy of Sciences of the United States of America},
author = {Girvan, M. and Newman, M. E. J.},
date = {2002},
pmid = {12060727},
keywords = {Models, Neural Networks (Computer), Theoretical, Algorithms, Animals, Community Networks, Computer Simulation, Humans, Nerve Net, Nerve Net: physiology, Social Behavior},
file = {Attachment:/home/dimitri/Zotero/storage/X3C73Q5I/Girvan, Newman - 2002 - Community structure in social and biological networks.pdf:application/pdf}
}
@book{newman_networks:_2010,
location = {Oxford ; New York},
title = {Networks: an introduction},
isbn = {978-0-19-920665-0},
shorttitle = {Networks},
abstract = {"The scientific study of networks, including computer networks, social networks, and biological networks, has received an enormous amount of interest in the last few years. The rise of the Internet and the wide availability of inexpensive computers have made it possible to gather and analyze network data on a large scale, and the development of a variety of new theoretical tools has allowed us to extract new knowledge from many different kinds of networks. The study of networks is broadly interdisciplinary and important developments have occurred in many fields, including mathematics, physics, computer and information sciences, biology, and the social sciences. This book brings together for the first time the most important breakthroughs in each of these fields and presents them in a coherent fashion, highlighting the strong interconnections between work in different areas. Subjects covered include the measurement and structure of networks in many branches of science, methods for analyzing network data, including methods developed in physics, statistics, and sociology, the fundamentals of graph theory, computer algorithms, and spectral methods, mathematical models of networks, including random graph models and generative models, and theories of dynamical processes taking place on networks"--},
pagetotal = {772},
publisher = {Oxford University Press},
author = {Newman, M. E. J.},
date = {2010},
note = {{OCLC}: ocn456837194},
keywords = {Engineering systems, Network analysis (Planning), Social systems, System analysis, Systems biology},
file = {Mark_Newman_Networks_An_Introduction.pdf:/home/dimitri/Zotero/storage/FDMM48IV/Mark_Newman_Networks_An_Introduction.pdf:application/pdf}
}
@article{tabourier_predicting_2016, @article{tabourier_predicting_2016,
title = {Predicting links in ego-networks using temporal information}, title = {Predicting links in ego-networks using temporal information},
volume = {5}, volume = {5},
@ -425,21 +457,206 @@
file = {Snapshot:/home/dimitri/Zotero/storage/CYSLT5MA/10.html:text/html} file = {Snapshot:/home/dimitri/Zotero/storage/CYSLT5MA/10.html:text/html}
} }
@article{tomita_worst-case_2006, @article{isella_whats_2011,
title = {The worst-case time complexity for generating all maximal cliques and computational experiments}, title = {What's in a crowd? Analysis of face-to-face behavioral networks},
volume = {363}, volume = {271},
issn = {0304-3975}, issn = {0022-5193},
url = {http://www.sciencedirect.com/science/article/pii/S0304397506003586}, url = {http://www.sciencedirect.com/science/article/pii/S0022519310006284},
doi = {10.1016/j.tcs.2006.06.015}, doi = {10.1016/j.jtbi.2010.11.033},
series = {Computing and Combinatorics}, shorttitle = {What's in a crowd?},
abstract = {We present a depth-first search algorithm for generating all maximal cliques of an undirected graph, in which pruning methods are employed as in the BronKerbosch algorithm. All the maximal cliques generated are output in a tree-like form. Subsequently, we prove that its worst-case time complexity is O(3n/3) for an n-vertex graph. This is optimal as a function of n, since there exist up to 3n/3 maximal cliques in an n-vertex graph. The algorithm is also demonstrated to run very fast in practice by computational experiments.}, abstract = {The availability of new data sources on human mobility is opening new avenues for investigating the interplay of social networks, human mobility and dynamical processes such as epidemic spreading. Here we analyze data on the time-resolved face-to-face proximity of individuals in large-scale real-world scenarios. We compare two settings with very different properties, a scientific conference and a long-running museum exhibition. We track the behavioral networks of face-to-face proximity, and characterize them from both a static and a dynamic point of view, exposing differences and similarities. We use our data to investigate the dynamics of a susceptibleinfected model for epidemic spreading that unfolds on the dynamical networks of human proximity. The spreading patterns are markedly different for the conference and the museum case, and they are strongly impacted by the causal structure of the network data. A deeper study of the spreading paths shows that the mere knowledge of static aggregated networks would lead to erroneous conclusions about the transmission paths on the dynamical networks.},
pages = {28--42}, pages = {166--180},
number = {1}, number = {1},
journaltitle = {Theoretical Computer Science}, journaltitle = {Journal of Theoretical Biology},
shortjournal = {Theoretical Computer Science}, shortjournal = {Journal of Theoretical Biology},
author = {Tomita, Etsuji and Tanaka, Akira and Takahashi, Haruhisa}, author = {Isella, Lorenzo and Stehlé, Juliette and Barrat, Alain and Cattuto, Ciro and Pinton, Jean-François and Van den Broeck, Wouter},
urldate = {2018-07-31}, urldate = {2018-08-08},
date = {2006-10-25}, date = {2011-02-21},
keywords = {Computational experiments, Enumeration, Maximal cliques, Worst-case time complexity}, keywords = {Complex networks, Behavioral social networks, Dynamic networks, Face-to-face proximity, Information spreading},
file = {ScienceDirect Full Text PDF:/home/dimitri/Zotero/storage/QDLTAXHX/Tomita et al. - 2006 - The worst-case time complexity for generating all .pdf:application/pdf;ScienceDirect Snapshot:/home/dimitri/Zotero/storage/TCJ8J7MV/S0304397506003586.html:text/html} file = {Isella et al. - 2011 - What's in a crowd Analysis of face-to-face behavi.pdf:/home/dimitri/Zotero/storage/56DMKRM7/Isella et al. - 2011 - What's in a crowd Analysis of face-to-face behavi.pdf:application/pdf;ScienceDirect Snapshot:/home/dimitri/Zotero/storage/J4DJF3P8/S0022519310006284.html:text/html}
}
@inproceedings{sulo_meaningful_2010,
location = {New York, {NY}, {USA}},
title = {Meaningful Selection of Temporal Resolution for Dynamic Networks},
isbn = {978-1-4503-0214-2},
url = {http://doi.acm.org/10.1145/1830252.1830269},
doi = {10.1145/1830252.1830269},
series = {{MLG} '10},
abstract = {The understanding of dynamics of data streams is greatly affected by the choice of temporal resolution at which the data are discretized, aggregated, and analyzed. Our paper focuses explicitly on data streams represented as dynamic networks. We propose a framework for identifying meaningful resolution levels that best reveal critical changes in the network structure, by balancing the reduction of noise with the loss of information. We demonstrate the applicability of our approach by analyzing various network statistics of both synthetic and real dynamic networks and using those to detect important events and changes in dynamic network structure.},
pages = {127--136},
booktitle = {Proceedings of the Eighth Workshop on Mining and Learning with Graphs},
publisher = {{ACM}},
author = {Sulo, Rajmonda and Berger-Wolf, Tanya and Grossman, Robert},
urldate = {2018-08-22},
date = {2010}
}
@article{krings_effects_2012,
title = {Effects of time window size and placement on the structure of an aggregated communication network},
volume = {1},
rights = {2012 Krings et al.; licensee Springer.},
issn = {2193-1127},
url = {https://epjdatascience.springeropen.com/articles/10.1140/epjds4},
doi = {10.1140/epjds4},
abstract = {Complex networks are often constructed by aggregating empirical data over time, such that a link represents the existence of interactions between the endpoint nodes and the link weight represents the intensity of such interactions within the aggregation time window. The resulting networks are then often considered static. More often than not, the aggregation time window is dictated by the availability of data, and the effects of its length on the resulting networks are rarely considered. Here, we address this question by studying the structural features of networks emerging from aggregating empirical data over different time intervals, focussing on networks derived from time-stamped, anonymized mobile telephone call records. Our results show that short aggregation intervals yield networks where strong links associated with dense clusters dominate; the seeds of such clusters or communities become already visible for intervals of around one week. The degree and weight distributions are seen to become stationary around a few days and a few weeks, respectively. An aggregation interval of around 30 days results in the stablest similar networks when consecutive windows are compared. For longer intervals, the effects of weak or random links become increasingly stronger, and the average degree of the network keeps growing even for intervals up to 180 days. The placement of the time window is also seen to affect the outcome: for short windows, different behavioural patterns play a role during weekends and weekdays, and for longer windows it is seen that networks aggregated during holiday periods are significantly different.},
pages = {4},
number = {1},
journaltitle = {{EPJ} Data Science},
author = {Krings, Gautier and Karsai, Márton and Bernhardsson, Sebastian and Blondel, Vincent D. and Saramäki, Jari},
urldate = {2018-08-22},
date = {2012-12},
file = {Full Text PDF:/home/dimitri/Zotero/storage/3Y8AHZXA/Krings et al. - 2012 - Effects of time window size and placement on the s.pdf:application/pdf;Snapshot:/home/dimitri/Zotero/storage/AQLGJGDL/epjds4.html:text/html}
}
@article{ribeiro_quantifying_2013,
title = {Quantifying the effect of temporal resolution on time-varying networks},
volume = {3},
rights = {2013 Nature Publishing Group},
issn = {2045-2322},
url = {https://www.nature.com/articles/srep03006},
doi = {10.1038/srep03006},
abstract = {Time-varying networks describe a wide array of systems whose constituents and interactions evolve over time. They are defined by an ordered stream of interactions between nodes, yet they are often represented in terms of a sequence of static networks, each aggregating all edges and nodes present in a time interval of size Δt. In this work we quantify the impact of an arbitrary Δt on the description of a dynamical process taking place upon a time-varying network. We focus on the elementary random walk, and put forth a simple mathematical framework that well describes the behavior observed on real datasets. The analytical description of the bias introduced by time integrating techniques represents a step forward in the correct characterization of dynamical processes on time-varying graphs.},
pages = {3006},
journaltitle = {Scientific Reports},
author = {Ribeiro, Bruno and Perra, Nicola and Baronchelli, Andrea},
urldate = {2018-08-22},
date = {2013-10-21},
langid = {english},
file = {Full Text PDF:/home/dimitri/Zotero/storage/9WPT9TVJ/Ribeiro et al. - 2013 - Quantifying the effect of temporal resolution on t.pdf:application/pdf;Snapshot:/home/dimitri/Zotero/storage/5IKE4WIN/srep03006.html:text/html}
}
@book{fouss_algorithms_2016,
title = {Algorithms and Models for Network Data and Link Analysis},
isbn = {978-1-107-12577-3},
abstract = {Network data are produced automatically by everyday interactions - social networks, power grids, and links between data sets are a few examples. Such data capture social and economic behavior in a form that can be analyzed using powerful computational tools. This book is a guide to both basic and advanced techniques and algorithms for extracting useful information from network data. The content is organized around 'tasks', grouping the algorithms needed to gather specific types of information and thus answer specific types of questions. Examples include similarity between nodes in a network, prestige or centrality of individual nodes, and dense regions or communities in a network. Algorithms are derived in detail and summarized in pseudo-code. The book is intended primarily for computer scientists, engineers, statisticians and physicists, but it is also accessible to network scientists based in the social sciences. {MATLAB}®/Octave code illustrating some of the algorithms will be available at: http://www.cambridge.org/9781107125773.},
pagetotal = {549},
publisher = {Cambridge University Press},
author = {Fouss, François and Saerens, Marco and Shimbo, Masashi},
date = {2016-07-12},
langid = {english},
note = {Google-Books-{ID}: {AUJfDAAAQBAJ}},
keywords = {Computers / Computer Science, Computers / Databases / Data Mining, Computers / Databases / General},
file = {Fouss et al. - 2016 - Algorithms and Models for Network Data and Link An.pdf:/home/dimitri/Zotero/storage/DULGH6PQ/Fouss et al. - 2016 - Algorithms and Models for Network Data and Link An.pdf:application/pdf}
}
@article{cattuto_dynamics_2010,
title = {Dynamics of Person-to-Person Interactions from Distributed {RFID} Sensor Networks},
volume = {5},
issn = {1932-6203},
url = {https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0011596},
doi = {10.1371/journal.pone.0011596},
abstract = {Background Digital networks, mobile devices, and the possibility of mining the ever-increasing amount of digital traces that we leave behind in our daily activities are changing the way we can approach the study of human and social interactions. Large-scale datasets, however, are mostly available for collective and statistical behaviors, at coarse granularities, while high-resolution data on person-to-person interactions are generally limited to relatively small groups of individuals. Here we present a scalable experimental framework for gathering real-time data resolving face-to-face social interactions with tunable spatial and temporal granularities. Methods and Findings We use active Radio Frequency Identification ({RFID}) devices that assess mutual proximity in a distributed fashion by exchanging low-power radio packets. We analyze the dynamics of person-to-person interaction networks obtained in three high-resolution experiments carried out at different orders of magnitude in community size. The data sets exhibit common statistical properties and lack of a characteristic time scale from 20 seconds to several hours. The association between the number of connections and their duration shows an interesting super-linear behavior, which indicates the possibility of defining super-connectors both in the number and intensity of connections. Conclusions Taking advantage of scalability and resolution, this experimental framework allows the monitoring of social interactions, uncovering similarities in the way individuals interact in different contexts, and identifying patterns of super-connector behavior in the community. These results could impact our understanding of all phenomena driven by face-to-face interactions, such as the spreading of transmissible infectious diseases and information.},
pages = {e11596},
number = {7},
journaltitle = {{PLOS} {ONE}},
shortjournal = {{PLOS} {ONE}},
author = {Cattuto, Ciro and Broeck, Wouter Van den and Barrat, Alain and Colizza, Vittoria and Pinton, Jean-François and Vespignani, Alessandro},
urldate = {2018-09-07},
date = {2010-07-15},
langid = {english},
keywords = {Computer networks, Behavior, Behavioral geography, Human mobility, Probability distribution, Radio waves, Statistical data, Statistical distributions},
file = {Full Text PDF:/home/dimitri/Zotero/storage/GFAHQ6F2/Cattuto et al. - 2010 - Dynamics of Person-to-Person Interactions from Dis.pdf:application/pdf;Snapshot:/home/dimitri/Zotero/storage/67R2UX2N/article.html:text/html}
}
@online{noauthor_infectious_2011,
title = {Infectious {SocioPatterns}},
url = {http://www.sociopatterns.org/datasets/infectious-sociopatterns/},
abstract = {A research project that aims to uncover fundamental patterns in social dynamics and coordinated human activity through a data-driven approach.},
titleaddon = {{SocioPatterns}.org},
date = {2011-03-31},
langid = {american},
file = {Snapshot:/home/dimitri/Zotero/storage/VNBHGW9K/infectious-sociopatterns.html:text/html}
}
@online{noauthor_infectious_2011-1,
title = {Infectious {SocioPatterns} dynamic contact networks},
url = {http://www.sociopatterns.org/datasets/infectious-sociopatterns-dynamic-contact-networks/},
abstract = {A research project that aims to uncover fundamental patterns in social dynamics and coordinated human activity through a data-driven approach.},
titleaddon = {{SocioPatterns}.org},
date = {2011-11-28},
langid = {american},
file = {Snapshot:/home/dimitri/Zotero/storage/9YMG2VGK/infectious-sociopatterns-dynamic-contact-networks.html:text/html}
}
@article{holme_attack_2002,
title = {Attack vulnerability of complex networks},
volume = {65},
url = {https://link.aps.org/doi/10.1103/PhysRevE.65.056109},
doi = {10.1103/PhysRevE.65.056109},
abstract = {We study the response of complex networks subject to attacks on vertices and edges. Several existing complex network models as well as real-world networks of scientific collaborations and Internet traffic are numerically investigated, and the network performance is quantitatively measured by the average inverse geodesic length and the size of the largest connected subgraph. For each case of attacks on vertices and edges, four different attacking strategies are used: removals by the descending order of the degree and the betweenness centrality, calculated for either the initial network or the current network during the removal procedure. It is found that the removals by the recalculated degrees and betweenness centralities are often more harmful than the attack strategies based on the initial network, suggesting that the network structure changes as important vertices or edges are removed. Furthermore, the correlation between the betweenness centrality and the degree in complex networks is studied.},
pages = {056109},
number = {5},
journaltitle = {Physical Review E},
shortjournal = {Phys. Rev. E},
author = {Holme, Petter and Kim, Beom Jun and Yoon, Chang No and Han, Seung Kee},
urldate = {2018-09-09},
date = {2002-05-07},
file = {APS Snapshot:/home/dimitri/Zotero/storage/PW8XHWT3/PhysRevE.65.html:text/html}
}
@article{aledavood_digital_2015,
title = {Digital daily cycles of individuals},
volume = {3},
issn = {2296-424X},
url = {https://www.frontiersin.org/articles/10.3389/fphy.2015.00073/full},
doi = {10.3389/fphy.2015.00073},
abstract = {Humans, like almost all animals, are phase-locked to the diurnal cycle. Most of us sleep at night and are active through the day. Because we have evolved to function with this cycle, the circadian rhythm is deeply ingrained and even detectable at the biochemical level. However, within the broader day-night pattern, there are individual differences: e.g., some of us are intrinsically morning-active, while others prefer evenings. In this article, we look at digital daily cycles: circadian patterns of activity viewed through the lens of auto-recorded data of communication and online activity. We begin at the aggregate level, discuss earlier results, and illustrate differences between population-level daily rhythms in different media. Then we move on to the individual level, and show that there is a strong individual-level variation beyond averages: individuals typically have their distinctive daily pattern that persists in time. We conclude by discussing the driving forces behind these signature daily patterns, from personal traits (morningness/eveningness) to variation in activity level and external constraints, and outline possibilities for future research.},
journaltitle = {Frontiers in Physics},
shortjournal = {Front. Phys.},
author = {Aledavood, Talayeh and Lehmann, Sune and Saramäki, Jari},
urldate = {2018-09-09},
date = {2015},
keywords = {circadian rhythms, Digital phenotyping, electronic communication records, individual differences, Mobile phones},
file = {Full Text PDF:/home/dimitri/Zotero/storage/TZP4KMJ4/Aledavood et al. - 2015 - Digital daily cycles of individuals.pdf:application/pdf}
}
@article{aledavood_daily_2015,
title = {Daily Rhythms in Mobile Telephone Communication},
volume = {10},
issn = {1932-6203},
url = {https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0138098},
doi = {10.1371/journal.pone.0138098},
abstract = {Circadian rhythms are known to be important drivers of human activity and the recent availability of electronic records of human behaviour has provided fine-grained data of temporal patterns of activity on a large scale. Further, questionnaire studies have identified important individual differences in circadian rhythms, with people broadly categorised into morning-like or evening-like individuals. However, little is known about the social aspects of these circadian rhythms, or how they vary across individuals. In this study we use a unique 18-month dataset that combines mobile phone calls and questionnaire data to examine individual differences in the daily rhythms of mobile phone activity. We demonstrate clear individual differences in daily patterns of phone calls, and show that these individual differences are persistent despite a high degree of turnover in the individuals social networks. Further, womens calls were longer than mens calls, especially during the evening and at night, and these calls were typically focused on a small number of emotionally intense relationships. These results demonstrate that individual differences in circadian rhythms are not just related to broad patterns of morningness and eveningness, but have a strong social component, in directing phone calls to specific individuals at specific times of day.},
pages = {e0138098},
number = {9},
journaltitle = {{PLOS} {ONE}},
shortjournal = {{PLOS} {ONE}},
author = {Aledavood, Talayeh and López, Eduardo and Roberts, Sam G. B. and Reed-Tsochas, Felix and Moro, Esteban and Dunbar, Robin I. M. and Saramäki, Jari},
urldate = {2018-09-09},
date = {2015-09-21},
langid = {english},
keywords = {Behavior, Cell phones, Circadian rhythms, Emotions, Entropy, Interpersonal relationships, Questionnaires, Social networks},
file = {Full Text PDF:/home/dimitri/Zotero/storage/FFG9S8PK/Aledavood et al. - 2015 - Daily Rhythms in Mobile Telephone Communication.pdf:application/pdf;Snapshot:/home/dimitri/Zotero/storage/MI9VN585/article.html:text/html}
}
@article{holme_network_2003,
title = {Network dynamics of ongoing social relationships},
volume = {64},
issn = {0295-5075},
url = {http://iopscience.iop.org/article/10.1209/epl/i2003-00505-4/meta},
doi = {10.1209/epl/i2003-00505-4},
pages = {427},
number = {3},
journaltitle = {{EPL} (Europhysics Letters)},
shortjournal = {{EPL}},
author = {Holme, P.},
urldate = {2018-09-09},
date = {2003-11},
langid = {english},
file = {Snapshot:/home/dimitri/Zotero/storage/5IXF7A2B/meta.html:text/html}
}
@article{jo_circadian_2012,
title = {Circadian pattern and burstiness in mobile phone communication},
volume = {14},
issn = {1367-2630},
url = {http://stacks.iop.org/1367-2630/14/i=1/a=013055?key=crossref.49fc43f1e121d47657c8da6f05484442},
doi = {10.1088/1367-2630/14/1/013055},
pages = {013055},
number = {1},
journaltitle = {New Journal of Physics},
author = {Jo, Hang-Hyun and Karsai, Márton and Kertész, János and Kaski, Kimmo},
urldate = {2018-09-09},
date = {2012-01-25}
}

44
dissertation/wrcf.py Normal file
View file

@@ -0,0 +1,44 @@
#!/usr/bin/env python3
import numpy as np
import igraph as ig
import dionysus as d
def wrcf(G, weight="weight"):
    """Compute the weight-rank clique filtration (WRCF) of a graph.

    :param G: igraph Graph
    :param weight: name of the edge weight attribute
    :return: a sorted Dionysus filtration.
    """
    # Filtration step 0: every vertex enters as a 0-simplex.
    filtration = d.Filtration()
    for vertex in G.vs:
        filtration.append(d.Simplex([vertex.index], 0))
    # Distinct edge weights, ranked from heaviest to lightest.
    ranked_weights = np.unique(G.es[weight])[::-1]
    for step, threshold in enumerate(ranked_weights, start=1):
        # Threshold the graph: keep edges at least as heavy as `threshold`.
        # NOTE(review): igraph's subgraph_edges drops isolated vertices and
        # renumbers the rest by default, so clique indices are relative to
        # the thresholded subgraph — confirm this is intended.
        thresholded = G.subgraph_edges(G.es(lambda e: e[weight] >= threshold))
        # Each maximal clique becomes a simplex; taking its closure adds all
        # of its faces to the filtration at this step.
        for clique in thresholded.maximal_cliques():
            for face in d.closure([d.Simplex(clique)], len(clique)):
                filtration.append(d.Simplex(face, step))
    filtration.sort()
    return filtration
def wrcf_diagram(graph, weight="weight"):
    """Compute persistence diagrams of a graph using WRCF.

    :param graph: igraph Graph
    :param weight: name of the edge weight attribute
    :return: a list of persistence diagrams.
    """
    filtration = wrcf(graph, weight=weight)
    persistence = d.homology_persistence(filtration)
    return d.init_diagrams(persistence, filtration)

75
dissertation/zigzag.py Normal file
View file

@ -0,0 +1,75 @@
#!/usr/bin/env python3
import numpy as np
import igraph as ig
import dionysus as d
def sliding_windows(g, res=0.1, overlap=0):
    """Compute subnetworks of a temporal network based on temporal
    partitioning of the time range.

    :param g: igraph Graph whose edges carry a "time" attribute
    :param res: resolution, as a fraction of the total time range
        (e.g. 0.1 yields 10 windows)
    :param overlap: overlap between consecutive windows
        # TODO(review): currently accepted but unused — windows never
        # overlap; kept for interface compatibility.
    :return: a list of temporal networks (one subgraph per window).
    """
    times = np.array(g.es["time"])
    t_min, t_max = times.min(), times.max()
    duration = res * (t_max - t_min)
    n_windows = int(1 / res)
    windows = []
    for i in range(n_windows):
        lo = t_min + duration * i
        hi = t_min + duration * (i + 1)
        # Bug fix: the original used strict bounds (time_gt/time_lt) on both
        # ends, so edges exactly at a window boundary — including the very
        # first and very last timestamps — fell into no window at all.
        # Use an inclusive lower bound, and an inclusive upper bound on the
        # final window so the latest edges are kept exactly once.
        if i == n_windows - 1:
            edges = g.es.select(time_ge=lo, time_le=hi)
        else:
            edges = g.es.select(time_ge=lo, time_lt=hi)
        windows.append(g.subgraph_edges(edges))
    return windows
def max_simplicial_complex(g):
    """Return the maximal simplicial complex of a network g.

    :param g: igraph Graph
    :return: a Dionysus filtration built from the maximal cliques of g.
    """
    maximal_cliques = g.maximal_cliques()
    return d.Filtration(maximal_cliques)
def find_transitions(a):
    """Find the transition times in an array of presence times.

    :param a: iterable of presence flags, one per time step
    :return: list of indices at which the value changes (the state before
        index 0 is taken to be False).
    """
    transitions = []
    state = False
    for idx, flag in enumerate(a):
        if flag == state:
            continue
        transitions.append(idx)
        state = flag
    return transitions
def presence_times(g):
    """Compute the data required to compute zigzag persistence:
    simplicial complex and transition times.

    :param g: igraph Graph whose edges carry a "time" attribute
    :return: a tuple with the maximum simplicial complex and the
        transition times of each simplex.
    """
    # Bug fix: the original bound this to a local named
    # `max_simplicial_complex`, shadowing the module-level helper of the
    # same name; renamed to avoid the collision.
    # NOTE(review): this uses g.cliques() (all cliques, hence all faces),
    # whereas the max_simplicial_complex() helper keeps only maximal
    # cliques — confirm which is intended.
    full_complex = d.Filtration(g.cliques())
    # One static snapshot complex per distinct timestamp.
    snapshots = []
    for t in np.sort(np.unique(g.es["time"])):
        edges = g.es.select(time_eq=t)
        # NOTE(review): subgraph_edges renumbers vertices by default, so
        # snapshot clique indices may not match those of g — TODO confirm.
        cliques = g.subgraph_edges(edges).cliques()
        snapshots.append(d.Filtration(cliques))
    # For each simplex, a boolean presence vector over the snapshots,
    # then the indices at which that presence toggles.
    presences = [[s in snap for snap in snapshots] for s in full_complex]
    transition_times = [find_transitions(p) for p in presences]
    return (full_complex, transition_times)
def zigzag_network(g):
    """Compute zigzag persistence on a temporal network.

    :param g: igraph Graph
    :return: a list of persistence diagrams.
    """
    simplicial_complex, transition_times = presence_times(g)
    _, diagrams, _ = d.zigzag_homology_persistence(simplicial_complex,
                                                   transition_times)
    return diagrams