diff --git a/bib/all.bib b/bib/all.bib index 3fa535c..14f54d2 100644 --- a/bib/all.bib +++ b/bib/all.bib @@ -623,7 +623,7 @@ abstract = {Modern mobile devices have access to a wealth of data suitable for learning models, which in turn can greatly improve the user experience on the device. For example, language models can improve speech recognition and text entry, and image models can automatically select good photos. However, this rich data is often privacy sensitive, large in quantity, or both, which may preclude logging to the data center and training there using conventional approaches. We advocate an alternative that leaves the training data distributed on the mobile devices, and learns a shared model by aggregating locally-computed updates. We term this decentralized approach Federated Learning. We present a practical method for the federated learning of deep networks based on iterative model averaging, and conduct an extensive empirical evaluation, considering five different model architectures and four datasets. These experiments demonstrate the approach is robust to the unbalanced and non-IID data distributions that are a defining characteristic of this setting. Communication costs are the principal constraint, and we show a reduction in required communication rounds by 10-100x as compared to synchronized stochastic gradient descent.}, booktitle = {International {{Conference}} on {{Artificial Intelligence}} and {{Statistics}}}, date = {2016}, - author = {McMahan, H. Brendan and Moore, Eider and Ramage, Daniel and Hampson, Seth and y Arcas, Blaise Agüera}, + author = {McMahan, H. Brendan and Moore, Eider and Ramage, Daniel and Hampson, Seth and family=Arcas, given=Blaise Agüera, prefix=y, useprefix=false}, file = {/home/dimitri/Nextcloud/Zotero/storage/H359CB6E/McMahan et al. - 2016 - Communication-Efficient Learning of Deep Networks from Decentralized Data.pdf} } @@ -1126,7 +1126,7 @@ date = {2008}, pages = {17}, keywords = {080734315,10,1137,2008,62h17,82-08,91-08,91d30,accepted for publication,ams subject classifications,august,community structure,contingency tables,doi,in revised form,networks,pair counting,received by the editors,september 4}, - author = {Traud, Amanda L. and Kelsic, Eric D. and Mucha, Peter J. and a. Porter, Mason}, + author = {Traud, Amanda L. and Kelsic, Eric D. and Mucha, Peter J. and family=Porter, given=Mason, prefix=a., useprefix=false}, file = {/home/dimitri/Nextcloud/Zotero/storage/7NA4GN3X/Traud et al. - 2008 - Comparing Community Structure to Characteristics in Online Collegiate Social Networks.pdf} } @@ -3354,7 +3354,7 @@ publisher = {{Cambridge University Press}}, date = {2017}, keywords = {Random graphs}, - author = {van der Hofstad, Remco}, + author = {family=Hofstad, given=Remco, prefix=van der, useprefix=false}, file = {/home/dimitri/Nextcloud/Zotero/storage/D8JSABD6/NotesRGCN.pdf} } @@ -4351,8 +4351,7 @@ properties divide weighted networks in two broad classes: one is characterized b date = {2009}, pages = {247--256}, keywords = {algorithms,extended persistence,levelset zigzag,Mayer-Vietoris pyramid,zigzag persistent homology}, - author = {Carlsson, Gunnar and de Silva, Vin and Morozov, Dmitriy}, - options = {useprefix=true}, + author = {Carlsson, Gunnar and family=Silva, given=Vin, prefix=de, useprefix=true and Morozov, Dmitriy}, file = {/home/dimitri/Nextcloud/Zotero/storage/WNIUXA7Y/Carlsson et al. 
- 2009 - Zigzag Persistent Homology and Real-valued Functio.pdf} } @@ -4818,7 +4817,7 @@ novel application of the discriminatory power of PIs.}, urldate = {2018-07-31}, date = {2014-12-01}, pages = {193-214}, - author = {Chazal, Frédéric and de Silva, Vin and Oudot, Steve}, + author = {Chazal, Frédéric and family=Silva, given=Vin, prefix=de, useprefix=false and Oudot, Steve}, file = {/home/dimitri/Nextcloud/Zotero/storage/7EESRFL3/s10711-013-9937-z.html} } @@ -5073,8 +5072,7 @@ temporal network itself.}, date = {2011-06-01}, pages = {737-759}, keywords = {Persistent homology,Computational topology,Dimensionality reduction,Persistent cohomology}, - author = {de Silva, Vin and Morozov, Dmitriy and Vejdemo-Johansson, Mikael}, - options = {useprefix=true}, + author = {family=Silva, given=Vin, prefix=de, useprefix=true and Morozov, Dmitriy and Vejdemo-Johansson, Mikael}, file = {/home/dimitri/Nextcloud/Zotero/storage/EX9L3F7F/de Silva et al. - 2011 - Persistent Cohomology and Circular Coordinates.pdf} } @@ -5089,8 +5087,7 @@ temporal network itself.}, urldate = {2018-09-05}, date = {2011-12-01}, pages = {124003}, - author = {de Silva, Vin and Morozov, Dmitriy and Vejdemo-Johansson, Mikael}, - options = {useprefix=true} + author = {family=Silva, given=Vin, prefix=de, useprefix=true and Morozov, Dmitriy and Vejdemo-Johansson, Mikael} } @incollection{bauerDistributedComputationPersistent2014, @@ -5145,7 +5142,7 @@ temporal network itself.}, date = {2010-07-15}, pages = {e11596}, keywords = {Computer networks,Behavior,Behavioral geography,Human mobility,Probability distribution,Radio waves,Statistical data,Statistical distributions}, - author = {Cattuto, Ciro and den Broeck, Wouter Van and Barrat, Alain and Colizza, Vittoria and Pinton, Jean-François and Vespignani, Alessandro}, + author = {Cattuto, Ciro and family=Broeck, given=Wouter Van, prefix=den, useprefix=false and Barrat, Alain and Colizza, Vittoria and Pinton, Jean-François and Vespignani, Alessandro}, file = {/home/dimitri/Nextcloud/Zotero/storage/GFAHQ6F2/Cattuto et al. - 2010 - Dynamics of Person-to-Person Interactions from Dis.pdf;/home/dimitri/Nextcloud/Zotero/storage/67R2UX2N/article.html} } @@ -5180,8 +5177,7 @@ temporal network itself.}, urldate = {2018-09-08}, date = {2008-11-30}, keywords = {Computer Science - Computational Geometry,I.3.5}, - author = {Carlsson, Gunnar and de Silva, Vin}, - options = {useprefix=true}, + author = {Carlsson, Gunnar and family=Silva, given=Vin, prefix=de, useprefix=true}, file = {/home/dimitri/Nextcloud/Zotero/storage/PKSM89FF/Carlsson and de Silva - 2008 - Zigzag Persistence.pdf;/home/dimitri/Nextcloud/Zotero/storage/QF37EI5F/0812.html} } @@ -5650,8 +5646,7 @@ temporal network itself.}, urldate = {2019-01-05}, date = {2018-09-27}, keywords = {Statistics - Machine Learning,Computer Science - Artificial Intelligence,Computer Science - Machine Learning,Computer Science - Programming Languages}, - author = {van de Meent, Jan-Willem and Paige, Brooks and Yang, Hongseok and Wood, Frank}, - options = {useprefix=true}, + author = {family=Meent, given=Jan-Willem, prefix=van de, useprefix=true and Paige, Brooks and Yang, Hongseok and Wood, Frank}, file = {/home/dimitri/Nextcloud/Zotero/storage/P4QUYCRF/van de Meent et al. 
- 2018 - An Introduction to Probabilistic Programming.pdf;/home/dimitri/Nextcloud/Zotero/storage/J4HQPMDM/1809.html} } @@ -5668,7 +5663,7 @@ temporal network itself.}, date = {2017-11-28}, keywords = {Computer Science - Artificial Intelligence,Computer Science - Machine Learning,Mathematics - Category Theory}, author = {Fong, Brendan and Spivak, David I. and Tuyéras, Rémy}, - file = {/home/dimitri/Nextcloud/Zotero/storage/DWY4XRXD/Fong et al. - 2017 - Backprop as Functor A compositional perspective o.pdf;/home/dimitri/Nextcloud/Zotero/storage/VJ8FE8JF/1711.html} + file = {/home/dimitri/Nextcloud/Zotero/storage/2WBW6WCN/Fong et al. - 2017 - Backprop as Functor A compositional perspective o.pdf;/home/dimitri/Nextcloud/Zotero/storage/SSIRE6JS/1711.html} } @article{chenNeuralOrdinaryDifferential2018, @@ -6653,8 +6648,7 @@ temporal network itself.}, urldate = {2019-02-04}, date = {2019-01-31}, keywords = {Statistics - Machine Learning,Computer Science - Computation and Language,Computer Science - Machine Learning}, - author = {Yogatama, Dani and d' Autume, Cyprien de Masson and Connor, Jerome and Kocisky, Tomas and Chrzanowski, Mike and Kong, Lingpeng and Lazaridou, Angeliki and Ling, Wang and Yu, Lei and Dyer, Chris and Blunsom, Phil}, - options = {useprefix=true}, + author = {Yogatama, Dani and family=Autume, given=Cyprien de Masson, prefix=d', useprefix=true and Connor, Jerome and Kocisky, Tomas and Chrzanowski, Mike and Kong, Lingpeng and Lazaridou, Angeliki and Ling, Wang and Yu, Lei and Dyer, Chris and Blunsom, Phil}, file = {/home/dimitri/Nextcloud/Zotero/storage/FP7K77IR/Yogatama et al. - 2019 - Learning and Evaluating General Linguistic Intelli.pdf;/home/dimitri/Nextcloud/Zotero/storage/8V7JCZPB/1901.html} } @@ -6792,7 +6786,7 @@ temporal network itself.}, publisher = {{Springer}}, date = {2018}, author = {Korte, Bernhard and Vygen, Jens}, - file = {/home/dimitri/Nextcloud/Zotero/storage/5XEKT6AH/Korte and Vygen - 2018 - Combinatorial optimization theory and algorithms.pdf;/home/dimitri/Nextcloud/Zotero/storage/M7U27QRQ/Bernhard Korte, Jens Vygen - Combinatorial Optimization. Theory and Algorithms [6th ed.]-Springer (2018).pdf}, + file = {/home/dimitri/Nextcloud/Zotero/storage/M7U27QRQ/Bernhard Korte, Jens Vygen - Combinatorial Optimization. Theory and Algorithms [6th ed.]-Springer (2018).pdf}, note = {OCLC: 1011040795} } @@ -7010,7 +7004,7 @@ temporal network itself.}, date = {2019}, pages = {355-206}, author = {Peyré, Gabriel and Cuturi, Marco}, - file = {/home/dimitri/Nextcloud/Zotero/storage/GFYFNBMZ/Peyré and Cuturi - 2019 - Computational Optimal Transport.pdf} + file = {/home/dimitri/Nextcloud/Zotero/storage/GLNYIRM9/Peyré and Cuturi - 2019 - Computational Optimal Transport.pdf} } @article{jainNonconvexOptimizationMachine2017, @@ -7176,7 +7170,7 @@ temporal network itself.}, date = {2019-02-21}, keywords = {Statistics - Machine Learning,Computer Science - Machine Learning}, author = {Gabella, Maxime and Afambo, Nitya and Ebli, Stefania and Spreemann, Gard}, - file = {/home/dimitri/Nextcloud/Zotero/storage/NPS4RRH6/Gabella et al. - 2019 - Topology of Learning in Artificial Neural Networks.pdf;/home/dimitri/Nextcloud/Zotero/storage/PWI3E5VR/1902.html} + file = {/home/dimitri/Nextcloud/Zotero/storage/LMRXQ7UH/Gabella et al. 
- 2019 - Topology of Learning in Artificial Neural Networks.pdf} } @article{espadotoDeepLearningMultidimensional2019, @@ -8362,9 +8356,8 @@ temporal network itself.}, date = {2004}, pages = {63-71}, author = {Rasmussen, Carl Edward}, - editor = {Bousquet, Olivier and von Luxburg, Ulrike and Rätsch, Gunnar}, - options = {useprefix=true}, - file = {/home/dimitri/Nextcloud/Zotero/storage/KSCYH3JC/rasmussen2004.pdf}, + editor = {Bousquet, Olivier and family=Luxburg, given=Ulrike, prefix=von, useprefix=true and Rätsch, Gunnar}, + file = {/home/dimitri/Nextcloud/Zotero/storage/KSCYH3JC/rasmussen2004.pdf;/home/dimitri/Nextcloud/Zotero/storage/T22JWMGV/Rasmussen - 2004 - Gaussian Processes in Machine Learning.pdf}, doi = {10.1007/978-3-540-28650-9_4} } @@ -8483,7 +8476,7 @@ temporal network itself.}, urldate = {2019-03-29}, date = {2019-03-27}, keywords = {Statistics - Machine Learning,Computer Science - Machine Learning}, - author = {Ozair, Sherjil and Lynch, Corey and Bengio, Yoshua and van den Oord, Aaron and Levine, Sergey and Sermanet, Pierre}, + author = {Ozair, Sherjil and Lynch, Corey and Bengio, Yoshua and family=Oord, given=Aaron, prefix=van den, useprefix=false and Levine, Sergey and Sermanet, Pierre}, file = {/home/dimitri/Nextcloud/Zotero/storage/ZYQH2Y9K/Ozair et al. - 2019 - Wasserstein Dependency Measure for Representation .pdf;/home/dimitri/Nextcloud/Zotero/storage/PPDX5S4W/1903.html} } @@ -8560,7 +8553,7 @@ temporal network itself.}, urldate = {2019-04-01}, date = {2018-07-10}, keywords = {Statistics - Machine Learning,Computer Science - Machine Learning}, - author = {van den Oord, Aaron and Li, Yazhe and Vinyals, Oriol}, + author = {family=Oord, given=Aaron, prefix=van den, useprefix=false and Li, Yazhe and Vinyals, Oriol}, file = {/home/dimitri/Nextcloud/Zotero/storage/DNB5PFBL/Oord et al. - 2018 - Representation Learning with Contrastive Predictiv.pdf;/home/dimitri/Nextcloud/Zotero/storage/FW23WSJQ/1807.html} } @@ -8636,5 +8629,2013 @@ temporal network itself.}, file = {/home/dimitri/Nextcloud/Zotero/storage/VFCRFKBW/Liu et al. - 2019 - Linguistic Knowledge and Transferability of Contex.pdf;/home/dimitri/Nextcloud/Zotero/storage/BQTBRD84/1903.html} } -@preamble{ "\ifdefined\DeclarePrefChars\DeclarePrefChars{'’-}\else\fi " } +@inproceedings{meloGaussianProcessesRegression2012, + title = {Gaussian {{Processes}} for Regression : A Tutorial}, + shorttitle = {Gaussian {{Processes}} for Regression}, + abstract = {Gaussian processes are a powerful, non-parametric tool that can be be used in supervised learning, namely in regression but also in classification problems. The main advantages of this method are the ability of GPs to provide uncertainty estimates and to learn the noise and smoothness parameters from training data. The aim of this short tutorial is to provide the basic theoretical aspects of Gaussian Processes, as well as a brief practical overview on implementation. The main motivation of this work was to develop a new approach to detect outliers on acoustic navigation algorithms for Autonomous Underwater Vehicles, capable of adjusting to different operation scenarios, since this is a major problem in the majority of Autonomous Underwater Vehicles. 
In the last part of the tutorial, a brief insight on this actual problem, and the solution proposed, that involves Gaussian Processes as a predictor, and some background subtraction techniques is described.}, + date = {2012}, + keywords = {Acoustic cryptanalysis,Activation function,Algorithm,Approximation,Background subtraction,Effective method,Estimated,Expectation propagation,Gaussian process,Kalman filter,Kerrison Predictor,Normal Statistical Distribution,Subtraction Technique,Supervised learning}, + author = {Melo, José}, + file = {/home/dimitri/Nextcloud/Zotero/storage/2BMFEMFB/Melo - 2012 - Gaussian Processes for regression a tutorial.pdf} +} + +@article{damianouDeepGaussianProcesses2012, + archivePrefix = {arXiv}, + eprinttype = {arxiv}, + eprint = {1211.0358}, + primaryClass = {cs, math, stat}, + title = {Deep {{Gaussian Processes}}}, + url = {http://arxiv.org/abs/1211.0358}, + abstract = {In this paper we introduce deep Gaussian process (GP) models. Deep GPs are a deep belief network based on Gaussian process mappings. The data is modeled as the output of a multivariate GP. The inputs to that Gaussian process are then governed by another GP. A single layer model is equivalent to a standard GP or the GP latent variable model (GP-LVM). We perform inference in the model by approximate variational marginalization. This results in a strict lower bound on the marginal likelihood of the model which we use for model selection (number of layers and nodes per layer). Deep belief networks are typically applied to relatively large data sets using stochastic gradient descent for optimization. Our fully Bayesian treatment allows for the application of deep models even when data is scarce. Model selection by our variational bound shows that a five layer hierarchy is justified even when modelling a digit data set containing only 150 examples.}, + urldate = {2019-04-04}, + date = {2012-11-01}, + keywords = {Statistics - Machine Learning,Mathematics - Probability,Computer Science - Machine Learning,I.2.6,60G15; 58E30,G.1.2,G.3}, + author = {Damianou, Andreas C. and Lawrence, Neil D.}, + file = {/home/dimitri/Nextcloud/Zotero/storage/FCZ2J52I/Damianou and Lawrence - 2012 - Deep Gaussian Processes.pdf;/home/dimitri/Nextcloud/Zotero/storage/EPABC27J/1211.html} +} + +@article{leeDeepNeuralNetworks2017, + archivePrefix = {arXiv}, + eprinttype = {arxiv}, + eprint = {1711.00165}, + primaryClass = {cs, stat}, + title = {Deep {{Neural Networks}} as {{Gaussian Processes}}}, + url = {http://arxiv.org/abs/1711.00165}, + abstract = {It has long been known that a single-layer fully-connected neural network with an i.i.d. prior over its parameters is equivalent to a Gaussian process (GP), in the limit of infinite network width. This correspondence enables exact Bayesian inference for infinite width neural networks on regression tasks by means of evaluating the corresponding GP. Recently, kernel functions which mimic multi-layer random neural networks have been developed, but only outside of a Bayesian framework. As such, previous work has not identified that these kernels can be used as covariance functions for GPs and allow fully Bayesian prediction with a deep neural network. In this work, we derive the exact equivalence between infinitely wide deep networks and GPs. We further develop a computationally efficient pipeline to compute the covariance function for these GPs. We then use the resulting GPs to perform Bayesian inference for wide deep neural networks on MNIST and CIFAR-10. 
We observe that trained neural network accuracy approaches that of the corresponding GP with increasing layer width, and that the GP uncertainty is strongly correlated with trained network prediction error. We further find that test performance increases as finite-width trained networks are made wider and more similar to a GP, and thus that GP predictions typically outperform those of finite-width networks. Finally we connect the performance of these GPs to the recent theory of signal propagation in random neural networks.}, + urldate = {2019-04-04}, + date = {2017-10-31}, + keywords = {Statistics - Machine Learning,Computer Science - Machine Learning}, + author = {Lee, Jaehoon and Bahri, Yasaman and Novak, Roman and Schoenholz, Samuel S. and Pennington, Jeffrey and Sohl-Dickstein, Jascha}, + file = {/home/dimitri/Nextcloud/Zotero/storage/DE6S84PZ/Lee et al. - 2017 - Deep Neural Networks as Gaussian Processes.pdf;/home/dimitri/Nextcloud/Zotero/storage/RSNF5RDD/1711.html} +} + +@article{sileoCompositionSentenceEmbeddings2019, + archivePrefix = {arXiv}, + eprinttype = {arxiv}, + eprint = {1904.02464}, + primaryClass = {cs}, + title = {Composition of {{Sentence Embeddings}}:{{Lessons}} from {{Statistical Relational Learning}}}, + url = {http://arxiv.org/abs/1904.02464}, + shorttitle = {Composition of {{Sentence Embeddings}}}, + abstract = {Various NLP problems -- such as the prediction of sentence similarity, entailment, and discourse relations -- are all instances of the same general task: the modeling of semantic relations between a pair of textual elements. A popular model for such problems is to embed sentences into fixed size vectors, and use composition functions (e.g. concatenation or sum) of those vectors as features for the prediction. At the same time, composition of embeddings has been a main focus within the field of Statistical Relational Learning (SRL) whose goal is to predict relations between entities (typically from knowledge base triples). In this article, we show that previous work on relation prediction between texts implicitly uses compositions from baseline SRL models. We show that such compositions are not expressive enough for several tasks (e.g. natural language inference). We build on recent SRL models to address textual relational problems, showing that they are more expressive, and can alleviate issues from simpler compositions. The resulting models significantly improve the state of the art in both transferable sentence representation learning and relation prediction.}, + urldate = {2019-04-05}, + date = {2019-04-04}, + keywords = {Computer Science - Computation and Language}, + author = {Sileo, Damien and Van-De-Cruys, Tim and Pradel, Camille and Muller, Philippe}, + file = {/home/dimitri/Nextcloud/Zotero/storage/H8RDMLJF/Sileo et al. - 2019 - Composition of Sentence EmbeddingsLessons from St.pdf;/home/dimitri/Nextcloud/Zotero/storage/AP9TBIRV/1904.html} +} + +@article{jainAttentionNotExplanation2019, + archivePrefix = {arXiv}, + eprinttype = {arxiv}, + eprint = {1902.10186}, + primaryClass = {cs}, + title = {Attention Is Not {{Explanation}}}, + url = {http://arxiv.org/abs/1902.10186}, + abstract = {Attention mechanisms have seen wide adoption in neural NLP models. In addition to improving predictive performance, these are often touted as affording transparency: models equipped with attention provide a distribution over attended-to input units, and this is often presented (at least implicitly) as communicating the relative importance of inputs. 
However, it is unclear what relationship exists between attention weights and model outputs. In this work, we perform extensive experiments across a variety of NLP tasks that aim to assess the degree to which attention weights provide meaningful `explanations' for predictions. We find that they largely do not. For example, learned attention weights are frequently uncorrelated with gradient-based measures of feature importance, and one can identify very different attention distributions that nonetheless yield equivalent predictions. Our findings show that standard attention modules do not provide meaningful explanations and should not be treated as though they do. Code for all experiments is available at https://github.com/successar/AttentionExplanation.}, + urldate = {2019-04-05}, + date = {2019-02-26}, + keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language}, + author = {Jain, Sarthak and Wallace, Byron C.}, + file = {/home/dimitri/Nextcloud/Zotero/storage/3K7ULXYM/Jain and Wallace - 2019 - Attention is not Explanation.pdf;/home/dimitri/Nextcloud/Zotero/storage/K7N98VCP/1902.html} +} + +@article{hanUnsupervisedDomainAdaptation2019, + archivePrefix = {arXiv}, + eprinttype = {arxiv}, + eprint = {1904.02817}, + primaryClass = {cs}, + title = {Unsupervised {{Domain Adaptation}} of {{Contextualized Embeddings}}: {{A Case Study}} in {{Early Modern English}}}, + url = {http://arxiv.org/abs/1904.02817}, + shorttitle = {Unsupervised {{Domain Adaptation}} of {{Contextualized Embeddings}}}, + abstract = {Contextualized word embeddings such as ELMo and BERT provide a foundation for strong performance across a range of natural language processing tasks, in part by pretraining on a large and topically-diverse corpus. However, the applicability of this approach is unknown when the target domain varies substantially from the text used during pretraining. Specifically, we are interested the scenario in which labeled data is available in only a canonical source domain such as newstext, and the target domain is distinct from both the labeled corpus and the pretraining data. To address this scenario, we propose domain-adaptive fine-tuning, in which the contextualized embeddings are adapted by masked language modeling on the target domain. We test this approach on the challenging domain of Early Modern English, which differs substantially from existing pretraining corpora. Domain-adaptive fine-tuning yields an improvement of 4\textbackslash\% in part-of-speech tagging accuracy over a BERT baseline, substantially improving on prior work on this task.}, + urldate = {2019-04-08}, + date = {2019-04-04}, + keywords = {Computer Science - Digital Libraries,Computer Science - Computation and Language,Computer Science - Machine Learning}, + author = {Han, Xiaochuang and Eisenstein, Jacob}, + file = {/home/dimitri/Nextcloud/Zotero/storage/RT8FLXP4/Han and Eisenstein - 2019 - Unsupervised Domain Adaptation of Contextualized E.pdf;/home/dimitri/Nextcloud/Zotero/storage/4B9LSUQF/1904.html} +} + +@article{dziriEvaluatingCoherenceDialogue2019, + archivePrefix = {arXiv}, + eprinttype = {arxiv}, + eprint = {1904.03371}, + primaryClass = {cs}, + title = {Evaluating {{Coherence}} in {{Dialogue Systems}} Using {{Entailment}}}, + url = {http://arxiv.org/abs/1904.03371}, + abstract = {Evaluating open-domain dialogue systems is difficult due to the diversity of possible correct answers. 
Automatic metrics such as BLEU correlate weakly with human annotations, resulting in a significant bias across different models and datasets. Some researchers resort to human judgment experimentation for assessing response quality, which is expensive, time consuming, and not scalable. Moreover, judges tend to evaluate a small number of dialogues, meaning that minor differences in evaluation configuration may lead to dissimilar results. In this paper, we present interpretable metrics for evaluating topic coherence by making use of distributed sentence representations. Furthermore, we introduce calculable approximations of human judgment based on conversational coherence by adopting state-of-the-art entailment techniques. Results show that our metrics can be used as a surrogate for human judgment, making it easy to evaluate dialogue systems on large-scale datasets and allowing an unbiased estimate for the quality of the responses.}, + urldate = {2019-04-09}, + date = {2019-04-06}, + keywords = {Computer Science - Computation and Language,Computer Science - Machine Learning}, + author = {Dziri, Nouha and Kamalloo, Ehsan and Mathewson, Kory W. and Zaiane, Osmar}, + file = {/home/dimitri/Nextcloud/Zotero/storage/MVHTK4YK/Dziri et al. - 2019 - Evaluating Coherence in Dialogue Systems using Ent.pdf;/home/dimitri/Nextcloud/Zotero/storage/BCUIVB6S/1904.html} +} + +@article{shevlinApplyRichPsychological2019, + langid = {english}, + title = {Apply Rich Psychological Terms in {{AI}} with Care}, + volume = {1}, + issn = {2522-5839}, + url = {https://www.nature.com/articles/s42256-019-0039-y}, + doi = {10.1038/s42256-019-0039-y}, + abstract = {There is much to be gained from interdisciplinary efforts to tackle complex psychological notions such as ‘theory of mind’. However, careful and consistent communication is essential when comparing artificial and biological intelligence, say Henry Shevlin and Marta Halina.}, + number = {4}, + journaltitle = {Nature Machine Intelligence}, + urldate = {2019-04-09}, + date = {2019-04}, + pages = {165}, + author = {Shevlin, Henry and Halina, Marta}, + file = {/home/dimitri/Nextcloud/Zotero/storage/QDYI99KV/Shevlin and Halina - 2019 - Apply rich psychological terms in AI with care.pdf;/home/dimitri/Nextcloud/Zotero/storage/BI9FKKDU/s42256-019-0039-y.html} +} + +@article{richLessonsArtificialIntelligence2019, + langid = {english}, + title = {Lessons for Artificial Intelligence from the Study of Natural Stupidity}, + volume = {1}, + issn = {2522-5839}, + url = {http://www.nature.com/articles/s42256-019-0038-z}, + doi = {10.1038/s42256-019-0038-z}, + number = {4}, + journaltitle = {Nature Machine Intelligence}, + urldate = {2019-04-10}, + date = {2019-04}, + pages = {174-180}, + author = {Rich, Alexander S. and Gureckis, Todd M.}, + file = {/home/dimitri/Nextcloud/Zotero/storage/3C4SDZWS/Rich and Gureckis - 2019 - Lessons for artificial intelligence from the study.pdf} +} + +@article{whitakerCharacterizingImpactGeometric2019, + archivePrefix = {arXiv}, + eprinttype = {arxiv}, + eprint = {1904.04866}, + primaryClass = {cs}, + title = {Characterizing the Impact of Geometric Properties of Word Embeddings on Task Performance}, + url = {http://arxiv.org/abs/1904.04866}, + abstract = {Analysis of word embedding properties to inform their use in downstream NLP tasks has largely been studied by assessing nearest neighbors. 
However, geometric properties of the continuous feature space contribute directly to the use of embedding features in downstream models, and are largely unexplored. We consider four properties of word embedding geometry, namely: position relative to the origin, distribution of features in the vector space, global pairwise distances, and local pairwise distances. We define a sequence of transformations to generate new embeddings that expose subsets of these properties to downstream models and evaluate change in task performance to understand the contribution of each property to NLP models. We transform publicly available pretrained embeddings from three popular toolkits (word2vec, GloVe, and FastText) and evaluate on a variety of intrinsic tasks, which model linguistic information in the vector space, and extrinsic tasks, which use vectors as input to machine learning models. We find that intrinsic evaluations are highly sensitive to absolute position, while extrinsic tasks rely primarily on local similarity. Our findings suggest that future embedding models and post-processing techniques should focus primarily on similarity to nearby points in vector space.}, + urldate = {2019-04-11}, + date = {2019-04-09}, + keywords = {Computer Science - Computation and Language,Computer Science - Machine Learning}, + author = {Whitaker, Brendan and Newman-Griffis, Denis and Haldar, Aparajita and Ferhatosmanoglu, Hakan and Fosler-Lussier, Eric}, + file = {/home/dimitri/Nextcloud/Zotero/storage/XFEKYZAI/Whitaker et al. - 2019 - Characterizing the impact of geometric properties .pdf;/home/dimitri/Nextcloud/Zotero/storage/NZYCGS9F/1904.html} +} + +@article{wangBERTHasMouth2019, + archivePrefix = {arXiv}, + eprinttype = {arxiv}, + eprint = {1902.04094}, + primaryClass = {cs}, + title = {{{BERT}} Has a {{Mouth}}, and {{It Must Speak}}: {{BERT}} as a {{Markov Random Field Language Model}}}, + url = {http://arxiv.org/abs/1902.04094}, + shorttitle = {{{BERT}} Has a {{Mouth}}, and {{It Must Speak}}}, + abstract = {We show that BERT (Devlin et al., 2018) is a Markov random field language model. This formulation gives way to a natural procedure to sample sentences from BERT. We generate from BERT and find that it can produce high-quality, fluent generations. Compared to the generations of a traditional left-to-right language model, BERT generates sentences that are more diverse but of slightly worse quality.}, + urldate = {2019-04-11}, + date = {2019-02-11}, + keywords = {Computer Science - Computation and Language,Computer Science - Machine Learning}, + author = {Wang, Alex and Cho, Kyunghyun}, + file = {/home/dimitri/Nextcloud/Zotero/storage/S8QLHLQN/Wang and Cho - 2019 - BERT has a Mouth, and It Must Speak BERT as a Mar.pdf;/home/dimitri/Nextcloud/Zotero/storage/7JNBY733/1902.html} +} + +@article{wangFewshotLearningSurvey2019, + archivePrefix = {arXiv}, + eprinttype = {arxiv}, + eprint = {1904.05046}, + primaryClass = {cs}, + title = {Few-Shot {{Learning}}: {{A Survey}}}, + url = {http://arxiv.org/abs/1904.05046}, + shorttitle = {Few-Shot {{Learning}}}, + abstract = {The quest of `can machines think' and `can machines do what human do' are quests that drive the development of artificial intelligence. Although recent artificial intelligence succeeds in many data intensive applications, it still lacks the ability of learning from limited exemplars and fast generalizing to new tasks. To tackle this problem, one has to turn to machine learning, which supports the scientific study of artificial intelligence. 
Particularly, a machine learning problem called Few-Shot Learning (FSL) targets at this case. It can rapidly generalize to new tasks of limited supervised experience by turning to prior knowledge, which mimics human's ability to acquire knowledge from few examples through generalization and analogy. It has been seen as a test-bed for real artificial intelligence, a way to reduce laborious data gathering and computationally costly training, and antidote for rare cases learning. With extensive works on FSL emerging, we give a comprehensive survey for it. We first give the formal definition for FSL. Then we point out the core issues of FSL, which turns the problem from "how to solve FSL" to "how to deal with the core issues". Accordingly, existing works from the birth of FSL to the most recent published ones are categorized in a unified taxonomy, with thorough discussion of the pros and cons for different categories. Finally, we envision possible future directions for FSL in terms of problem setup, techniques, applications and theory, hoping to provide insights to both beginners and experienced researchers.}, + urldate = {2019-04-11}, + date = {2019-04-10}, + keywords = {Computer Science - Artificial Intelligence,Computer Science - Machine Learning}, + author = {Wang, Yaqing and Yao, Quanming}, + file = {/home/dimitri/Nextcloud/Zotero/storage/8B4V9HNB/Wang and Yao - 2019 - Few-shot Learning A Survey.pdf;/home/dimitri/Nextcloud/Zotero/storage/G9J3I6XP/1904.html} +} + +@thesis{galUncertaintyDeepLearning2016, + title = {Uncertainty in {{Deep Learning}}}, + url = {https://www.semanticscholar.org/paper/Uncertainty-in-Deep-Learning-Gal/3c623c08329e129e784a5d03f7606ec8feba3a28}, + abstract = {Deep learning has attracted tremendous attention from researchers in various fields of information engineering such as AI, computer vision, and language processing [Kalchbrenner and Blunsom, 2013; Krizhevsky et al., 2012; Mnih et al., 2013], but also from more traditional sciences such as physics, biology, and manufacturing [Anjos et al., 2015; Baldi et al., 2014; Bergmann et al., 2014]. Neural networks, image processing tools such as convolutional neural networks, sequence processing models such as recurrent neural networks, and regularisation tools such as dropout, are used extensively. However, fields such as physics, biology, and manufacturing are ones in which representing model uncertainty is of crucial importance [Ghahramani, 2015; Krzywinski and Altman, 2013]. With the recent shift in many of these fields towards the use of Bayesian uncertainty [Herzog and Ostwald, 2013; Nuzzo, 2014; Trafimow and Marks, 2015], new needs arise from deep learning. In this work we develop tools to obtain practical uncertainty estimates in deep learning, casting recent deep learning tools as Bayesian models without changing either the models or the optimisation. In the first part of this thesis we develop the theory for such tools, providing applications and illustrative examples. We tie approximate inference in Bayesian models to dropout and other stochastic regularisation techniques, and assess the approximations empirically. We give example applications arising from this connection between modern deep learning and Bayesian modelling such as active learning of image data and data efficient deep reinforcement learning. 
We further demonstrate the method’s practicality through a survey of recent applications making use of the suggested tools in language applications, medical diagnostics, bioinformatics, image processing, and autonomous driving. In the second part of the thesis we explore its theoretical implications, and the insights stemming from the link between Bayesian modelling and deep learning. We discuss what determines model uncertainty properties, analyse the approximate inference analytically in the linear case, and theoretically examine various priors such as spike and slab priors.}, + institution = {{University of Cambridge}}, + date = {2016}, + keywords = {Reinforcement learning,Mathematical optimization,Estimated,Action potential,Approximation algorithm,Attention deficit hyperactivity disorder,Autonomous car,Bayesian network,Bioinformatics,Computer vision,Convolutional neural network,Deep learning,Dropout (neural networks),Image processing,Information engineering,Manufacturing Facilities,Neural Networks,Recurrent neural network,Science,Slab allocation,Stemming}, + author = {Gal, Yarin}, + file = {/home/dimitri/Nextcloud/Zotero/storage/4UXBH6G3/Gal - 2016 - Uncertainty in Deep Learning.pdf} +} + +@article{ghahramaniProbabilisticMachineLearning2015, + langid = {english}, + title = {Probabilistic Machine Learning and Artificial Intelligence}, + volume = {521}, + issn = {1476-4687}, + url = {https://www.nature.com/articles/nature14541}, + doi = {10.1038/nature14541}, + abstract = {How can a machine learn from experience? Probabilistic modelling provides a framework for understanding what learning is, and has therefore emerged as one of the principal theoretical and practical approaches for designing machines that learn from data acquired through experience. The probabilistic framework, which describes how to represent and manipulate uncertainty about models and predictions, has a central role in scientific data analysis, machine learning, robotics, cognitive science and artificial intelligence. 
This Review provides an introduction to this framework, and discusses some of the state-of-the-art advances in the field, namely, probabilistic programming, Bayesian optimization, data compression and automatic model discovery.}, + number = {7553}, + journaltitle = {Nature}, + urldate = {2019-04-11}, + date = {2015-05}, + pages = {452-459}, + author = {Ghahramani, Zoubin}, + file = {/home/dimitri/Nextcloud/Zotero/storage/A98SXN69/Ghahramani - 2015 - Probabilistic machine learning and artificial inte.pdf;/home/dimitri/Nextcloud/Zotero/storage/A5TPUMHQ/nature14541.html} +} + +@article{settlesActiveLearning2012, + langid = {english}, + title = {Active {{Learning}}}, + volume = {6}, + issn = {1939-4608, 1939-4616}, + url = {http://www.morganclaypool.com/doi/abs/10.2200/S00429ED1V01Y201207AIM018}, + doi = {10.2200/S00429ED1V01Y201207AIM018}, + number = {1}, + journaltitle = {Synthesis Lectures on Artificial Intelligence and Machine Learning}, + urldate = {2019-04-11}, + date = {2012-06-30}, + pages = {1-114}, + author = {Settles, Burr}, + file = {/home/dimitri/Nextcloud/Zotero/storage/W2QS5HUK/Settles - 2012 - Active Learning.pdf} +} + +@article{shiUnsupervisedDialogStructure2019, + archivePrefix = {arXiv}, + eprinttype = {arxiv}, + eprint = {1904.03736}, + primaryClass = {cs}, + title = {Unsupervised {{Dialog Structure Learning}}}, + url = {http://arxiv.org/abs/1904.03736}, + abstract = {Learning a shared dialog structure from a set of task-oriented dialogs is an important challenge in computational linguistics. The learned dialog structure can shed light on how to analyze human dialogs, and more importantly contribute to the design and evaluation of dialog systems. We propose to extract dialog structures using a modified VRNN model with discrete latent vectors. Different from existing HMM-based models, our model is based on variational-autoencoder (VAE). Such model is able to capture more dynamics in dialogs beyond the surface forms of the language. We find that qualitatively, our method extracts meaningful dialog structure, and quantitatively, outperforms previous models on the ability to predict unseen data. We further evaluate the model's effectiveness in a downstream task, the dialog system building task. Experiments show that, by integrating the learned dialog structure into the reward function design, the model converges faster and to a better outcome in a reinforcement learning setting.}, + urldate = {2019-04-12}, + date = {2019-04-07}, + keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language}, + author = {Shi, Weiyan and Zhao, Tiancheng and Yu, Zhou}, + file = {/home/dimitri/Nextcloud/Zotero/storage/HZ9I7A2E/Shi et al. - 2019 - Unsupervised Dialog Structure Learning.pdf;/home/dimitri/Nextcloud/Zotero/storage/R6UBNIII/1904.html} +} + +@article{bronsteinGeometricDeepLearning2017, + archivePrefix = {arXiv}, + eprinttype = {arxiv}, + eprint = {1611.08097}, + title = {Geometric Deep Learning: Going beyond {{Euclidean}} Data}, + volume = {34}, + issn = {1053-5888}, + url = {http://arxiv.org/abs/1611.08097}, + doi = {10.1109/MSP.2017.2693418}, + shorttitle = {Geometric Deep Learning}, + abstract = {Many scientific fields study data with an underlying structure that is a non-Euclidean space. Some examples include social networks in computational social sciences, sensor networks in communications, functional networks in brain imaging, regulatory networks in genetics, and meshed surfaces in computer graphics. 
In many applications, such geometric data are large and complex (in the case of social networks, on the scale of billions), and are natural targets for machine learning techniques. In particular, we would like to use deep neural networks, which have recently proven to be powerful tools for a broad range of problems from computer vision, natural language processing, and audio analysis. However, these tools have been most successful on data with an underlying Euclidean or grid-like structure, and in cases where the invariances of these structures are built into networks used to model them. Geometric deep learning is an umbrella term for emerging techniques attempting to generalize (structured) deep neural models to non-Euclidean domains such as graphs and manifolds. The purpose of this paper is to overview different examples of geometric deep learning problems and present available solutions, key difficulties, applications, and future research directions in this nascent field.}, + number = {4}, + journaltitle = {IEEE Signal Processing Magazine}, + urldate = {2019-04-12}, + date = {2017-07}, + pages = {18-42}, + keywords = {Computer Science - Computer Vision and Pattern Recognition}, + author = {Bronstein, Michael M. and Bruna, Joan and LeCun, Yann and Szlam, Arthur and Vandergheynst, Pierre}, + file = {/home/dimitri/Nextcloud/Zotero/storage/KQKTILDN/Bronstein et al. - 2017 - Geometric deep learning going beyond Euclidean da.pdf;/home/dimitri/Nextcloud/Zotero/storage/UZNRDHAP/1611.html} +} + +@inproceedings{ferreiraExpertbasedRewardShaping2013, + location = {{Olomouc, Czech Republic}}, + title = {Expert-Based Reward Shaping and Exploration Scheme for Boosting Policy Learning of Dialogue Management}, + isbn = {978-1-4799-2756-2}, + url = {http://ieeexplore.ieee.org/document/6707714/}, + doi = {10.1109/ASRU.2013.6707714}, + eventtitle = {2013 {{IEEE Workshop}} on {{Automatic Speech Recognition}} \& {{Understanding}} ({{ASRU}})}, + booktitle = {2013 {{IEEE Workshop}} on {{Automatic Speech Recognition}} and {{Understanding}}}, + publisher = {{IEEE}}, + urldate = {2019-04-12}, + date = {2013-12}, + pages = {108-113}, + author = {Ferreira, Emmanuel and Lefevre, Fabrice}, + file = {/home/dimitri/Nextcloud/Zotero/storage/ND7G7QVU/Ferreira and Lefevre - 2013 - Expert-based reward shaping and exploration scheme.pdf} +} + +@article{zhangAdversarialAttacksDeep2019, + archivePrefix = {arXiv}, + eprinttype = {arxiv}, + eprint = {1901.06796}, + primaryClass = {cs}, + title = {Adversarial {{Attacks}} on {{Deep Learning Models}} in {{Natural Language Processing}}: {{A Survey}}}, + url = {http://arxiv.org/abs/1901.06796}, + shorttitle = {Adversarial {{Attacks}} on {{Deep Learning Models}} in {{Natural Language Processing}}}, + abstract = {With the development of high computational devices, deep neural networks (DNNs), in recent years, have gained significant popularity in many Artificial Intelligence (AI) applications. However, previous efforts have shown that DNNs were vulnerable to strategically modified samples, named adversarial examples. These samples are generated with some imperceptible perturbations but can fool the DNNs to give false predictions. Inspired by the popularity of generating adversarial examples for image DNNs, research efforts on attacking DNNs for textual applications emerges in recent years. However, existing perturbation methods for images cannot be directly applied to texts as text data is discrete.
In this article, we review research works that address this difference and generate textual adversarial examples on DNNs. We collect, select, summarize, discuss and analyze these works in a comprehensive way and cover all the related information to make the article self-contained. Finally, drawing on the reviewed literature, we provide further discussions and suggestions on this topic.}, + urldate = {2019-04-12}, + date = {2019-01-21}, + keywords = {Computer Science - Computation and Language}, + author = {Zhang, Wei Emma and Sheng, Quan Z. and Alhazmi, Ahoud and Li, Chenliang}, + file = {/home/dimitri/Nextcloud/Zotero/storage/3YEISHLH/Zhang et al. - 2019 - Adversarial Attacks on Deep Learning Models in Nat.pdf;/home/dimitri/Nextcloud/Zotero/storage/HKVIKPLC/1901.html} +} + +@inproceedings{zahavyGrayingBlackBox2016, + langid = {english}, + title = {Graying the Black Box: {{Understanding DQNs}}}, + url = {http://proceedings.mlr.press/v48/zahavy16.html}, + shorttitle = {Graying the Black Box}, + abstract = {In recent years there is a growing interest in using deep representations for reinforcement learning. In this paper, we present a methodology and tools to analyze Deep Q-networks (DQNs) in a non-bl...}, + eventtitle = {International {{Conference}} on {{Machine Learning}}}, + booktitle = {International {{Conference}} on {{Machine Learning}}}, + urldate = {2019-04-12}, + date = {2016-06-11}, + pages = {1899-1908}, + author = {Zahavy, Tom and Ben-Zrihem, Nir and Mannor, Shie}, + file = {/home/dimitri/Nextcloud/Zotero/storage/A9VKBUZI/zahavy16-supp.pdf;/home/dimitri/Nextcloud/Zotero/storage/FXTSNYTW/Zahavy et al. - 2016 - Graying the black box Understanding DQNs.pdf;/home/dimitri/Nextcloud/Zotero/storage/IVTBF7LM/zahavy16.html} +} + +@article{hendersonDeepReinforcementLearning2017, + archivePrefix = {arXiv}, + eprinttype = {arxiv}, + eprint = {1709.06560}, + primaryClass = {cs, stat}, + title = {Deep {{Reinforcement Learning}} That {{Matters}}}, + url = {http://arxiv.org/abs/1709.06560}, + abstract = {In recent years, significant progress has been made in solving challenging problems across various domains using deep reinforcement learning (RL). Reproducing existing work and accurately judging the improvements offered by novel methods is vital to sustaining this progress. Unfortunately, reproducing results for state-of-the-art deep RL methods is seldom straightforward. In particular, non-determinism in standard benchmark environments, combined with variance intrinsic to the methods, can make reported results tough to interpret. Without significance metrics and tighter standardization of experimental reporting, it is difficult to determine whether improvements over the prior state-of-the-art are meaningful. In this paper, we investigate challenges posed by reproducibility, proper experimental techniques, and reporting procedures. We illustrate the variability in reported metrics and results when comparing against common baselines and suggest guidelines to make future results in deep RL more reproducible.
We aim to spur discussion about how to ensure continued progress in the field by minimizing wasted effort stemming from results that are non-reproducible and easily misinterpreted.}, + urldate = {2019-04-16}, + date = {2017-09-19}, + keywords = {Statistics - Machine Learning,Computer Science - Machine Learning}, + author = {Henderson, Peter and Islam, Riashat and Bachman, Philip and Pineau, Joelle and Precup, Doina and Meger, David}, + file = {/home/dimitri/Nextcloud/Zotero/storage/PJEJP7R9/Henderson et al. - 2017 - Deep Reinforcement Learning that Matters.pdf;/home/dimitri/Nextcloud/Zotero/storage/IV2G8XEY/1709.html} +} + +@article{maniaSimpleRandomSearch2018, + archivePrefix = {arXiv}, + eprinttype = {arxiv}, + eprint = {1803.07055}, + primaryClass = {cs, math, stat}, + title = {Simple Random Search Provides a Competitive Approach to Reinforcement Learning}, + url = {http://arxiv.org/abs/1803.07055}, + abstract = {A common belief in model-free reinforcement learning is that methods based on random search in the parameter space of policies exhibit significantly worse sample complexity than those that explore the space of actions. We dispel such beliefs by introducing a random search method for training static, linear policies for continuous control problems, matching state-of-the-art sample efficiency on the benchmark MuJoCo locomotion tasks. Our method also finds a nearly optimal controller for a challenging instance of the Linear Quadratic Regulator, a classical problem in control theory, when the dynamics are not known. Computationally, our random search algorithm is at least 15 times more efficient than the fastest competing model-free methods on these benchmarks. We take advantage of this computational efficiency to evaluate the performance of our method over hundreds of random seeds and many different hyperparameter configurations for each benchmark task. Our simulations highlight a high variability in performance in these benchmark tasks, suggesting that commonly used estimations of sample efficiency do not adequately evaluate the performance of RL algorithms.}, + urldate = {2019-04-16}, + date = {2018-03-19}, + keywords = {Statistics - Machine Learning,Computer Science - Artificial Intelligence,Computer Science - Machine Learning,Mathematics - Optimization and Control}, + author = {Mania, Horia and Guy, Aurelia and Recht, Benjamin}, + file = {/home/dimitri/Nextcloud/Zotero/storage/S5MWXRFU/Mania et al. - 2018 - Simple random search provides a competitive approa.pdf;/home/dimitri/Nextcloud/Zotero/storage/FHCCPQ7M/1803.html} +} + +@article{zhaoRethinkingActionSpaces2019, + archivePrefix = {arXiv}, + eprinttype = {arxiv}, + eprint = {1902.08858}, + primaryClass = {cs}, + title = {Rethinking {{Action Spaces}} for {{Reinforcement Learning}} in {{End}}-to-End {{Dialog Agents}} with {{Latent Variable Models}}}, + url = {http://arxiv.org/abs/1902.08858}, + abstract = {Defining action spaces for conversational agents and optimizing their decision-making process with reinforcement learning is an enduring challenge. Common practice has been to use handcrafted dialog acts, or the output vocabulary, e.g. in neural encoder decoders, as the action spaces. Both have their own limitations. This paper proposes a novel latent action framework that treats the action spaces of an end-to-end dialog agent as latent variables and develops unsupervised methods in order to induce its own action space from the data. 
Comprehensive experiments are conducted examining both continuous and discrete action types and two different optimization methods based on stochastic variational inference. Results show that the proposed latent actions achieve superior empirical performance improvement over previous word-level policy gradient methods on both DealOrNoDeal and MultiWoz dialogs. Our detailed analysis also provides insights about various latent variable approaches for policy learning and can serve as a foundation for developing better latent actions in future research.}, + urldate = {2019-04-16}, + date = {2019-02-23}, + keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language}, + author = {Zhao, Tiancheng and Xie, Kaige and Eskenazi, Maxine}, + file = {/home/dimitri/Nextcloud/Zotero/storage/75K4H5SF/Zhao et al. - 2019 - Rethinking Action Spaces for Reinforcement Learnin.pdf;/home/dimitri/Nextcloud/Zotero/storage/HLT5EGC8/1902.html} +} + +@article{ratnerDataProgrammingCreating2016, + archivePrefix = {arXiv}, + eprinttype = {arxiv}, + eprint = {1605.07723}, + primaryClass = {cs, stat}, + title = {Data {{Programming}}: {{Creating Large Training Sets}}, {{Quickly}}}, + url = {http://arxiv.org/abs/1605.07723}, + shorttitle = {Data {{Programming}}}, + abstract = {Large labeled training sets are the critical building blocks of supervised learning methods and are key enablers of deep learning techniques. For some applications, creating labeled training sets is the most time-consuming and expensive part of applying machine learning. We therefore propose a paradigm for the programmatic creation of training sets called data programming in which users express weak supervision strategies or domain heuristics as labeling functions, which are programs that label subsets of the data, but that are noisy and may conflict. We show that by explicitly representing this training set labeling process as a generative model, we can "denoise" the generated training set, and establish theoretically that we can recover the parameters of these generative models in a handful of settings. We then show how to modify a discriminative loss function to make it noise-aware, and demonstrate our method over a range of discriminative models including logistic regression and LSTMs. Experimentally, on the 2014 TAC-KBP Slot Filling challenge, we show that data programming would have led to a new winning score, and also show that applying data programming to an LSTM model leads to a TAC-KBP score almost 6 F1 points over a state-of-the-art LSTM baseline (and into second place in the competition). Additionally, in initial user studies we observed that data programming may be an easier way for non-experts to create machine learning models when training data is limited or unavailable.}, + urldate = {2019-04-16}, + date = {2016-05-25}, + keywords = {Statistics - Machine Learning,Computer Science - Artificial Intelligence,Computer Science - Machine Learning}, + author = {Ratner, Alexander and De Sa, Christopher and Wu, Sen and Selsam, Daniel and Ré, Christopher}, + file = {/home/dimitri/Nextcloud/Zotero/storage/DM9SNV2M/Ratner et al. 
- 2016 - Data Programming Creating Large Training Sets, Qu.pdf;/home/dimitri/Nextcloud/Zotero/storage/RJD8M3JQ/1605.html} +} + +@article{ratnerSnorkelRapidTraining2017, + archivePrefix = {arXiv}, + eprinttype = {arxiv}, + eprint = {1711.10160}, + title = {Snorkel: {{Rapid Training Data Creation}} with {{Weak Supervision}}}, + volume = {11}, + issn = {21508097}, + url = {http://arxiv.org/abs/1711.10160}, + doi = {10.14778/3157794.3157797}, + shorttitle = {Snorkel}, + abstract = {Labeling training data is increasingly the largest bottleneck in deploying machine learning systems. We present Snorkel, a first-of-its-kind system that enables users to train state-of-the-art models without hand labeling any training data. Instead, users write labeling functions that express arbitrary heuristics, which can have unknown accuracies and correlations. Snorkel denoises their outputs without access to ground truth by incorporating the first end-to-end implementation of our recently proposed machine learning paradigm, data programming. We present a flexible interface layer for writing labeling functions based on our experience over the past year collaborating with companies, agencies, and research labs. In a user study, subject matter experts build models 2.8x faster and increase predictive performance an average 45.5\% versus seven hours of hand labeling. We study the modeling tradeoffs in this new setting and propose an optimizer for automating tradeoff decisions that gives up to 1.8x speedup per pipeline execution. In two collaborations, with the U.S. Department of Veterans Affairs and the U.S. Food and Drug Administration, and on four open-source text and image data sets representative of other deployments, Snorkel provides 132\% average improvements to predictive performance over prior heuristic approaches and comes within an average 3.60\% of the predictive performance of large hand-curated training sets.}, + number = {3}, + journaltitle = {Proceedings of the VLDB Endowment}, + urldate = {2019-04-16}, + date = {2017-11-01}, + pages = {269-282}, + keywords = {Statistics - Machine Learning,Computer Science - Machine Learning}, + author = {Ratner, Alexander and Bach, Stephen H. and Ehrenberg, Henry and Fries, Jason and Wu, Sen and Ré, Christopher}, + file = {/home/dimitri/Nextcloud/Zotero/storage/KENDFREY/Ratner et al. - 2017 - Snorkel Rapid Training Data Creation with Weak Su.pdf;/home/dimitri/Nextcloud/Zotero/storage/YWL8DGM3/1711.html} +} + +@article{bachLearningStructureGenerative2017, + archivePrefix = {arXiv}, + eprinttype = {arxiv}, + eprint = {1703.00854}, + primaryClass = {cs, stat}, + title = {Learning the {{Structure}} of {{Generative Models}} without {{Labeled Data}}}, + url = {http://arxiv.org/abs/1703.00854}, + abstract = {Curating labeled training data has become the primary bottleneck in machine learning. Recent frameworks address this bottleneck with generative models to synthesize labels at scale from weak supervision sources. The generative model's dependency structure directly affects the quality of the estimated labels, but selecting a structure automatically without any labeled data is a distinct challenge. We propose a structure estimation method that maximizes the \$\textbackslash{}ell\_1\$-regularized marginal pseudolikelihood of the observed data. Our analysis shows that the amount of unlabeled data required to identify the true structure scales sublinearly in the number of possible dependencies for a broad class of models. 
Simulations show that our method is 100\$\textbackslash{}times\$ faster than a maximum likelihood approach and selects \$1/4\$ as many extraneous dependencies. We also show that our method provides an average of 1.5 F1 points of improvement over existing, user-developed information extraction applications on real-world data such as PubMed journal abstracts.}, + urldate = {2019-04-16}, + date = {2017-03-02}, + keywords = {Statistics - Machine Learning,Computer Science - Machine Learning}, + author = {Bach, Stephen H. and He, Bryan and Ratner, Alexander and Ré, Christopher}, + file = {/home/dimitri/Nextcloud/Zotero/storage/CVGAF7ZB/Bach et al. - 2017 - Learning the Structure of Generative Models withou.pdf;/home/dimitri/Nextcloud/Zotero/storage/DZ7NGBTE/1703.html} +} + +@article{varmaLearningDependencyStructures2019, + archivePrefix = {arXiv}, + eprinttype = {arxiv}, + eprint = {1903.05844}, + primaryClass = {cs, stat}, + title = {Learning {{Dependency Structures}} for {{Weak Supervision Models}}}, + url = {http://arxiv.org/abs/1903.05844}, + abstract = {Labeling training data is a key bottleneck in the modern machine learning pipeline. Recent weak supervision approaches combine labels from multiple noisy sources by estimating their accuracies without access to ground truth labels; however, estimating the dependencies among these sources is a critical challenge. We focus on a robust PCA-based algorithm for learning these dependency structures, establish improved theoretical recovery rates, and outperform existing methods on various real-world tasks. Under certain conditions, we show that the amount of unlabeled data needed can scale sublinearly or even logarithmically with the number of sources \$m\$, improving over previous efforts that ignore the sparsity pattern in the dependency structure and scale linearly in \$m\$. We provide an information-theoretic lower bound on the minimum sample complexity of the weak supervision setting. Our method outperforms weak supervision approaches that assume conditionally-independent sources by up to 4.64 F1 points and previous structure learning approaches by up to 4.41 F1 points on real-world relation extraction and image classification tasks.}, + urldate = {2019-04-16}, + date = {2019-03-14}, + keywords = {Statistics - Machine Learning,Computer Science - Machine Learning}, + author = {Varma, Paroma and Sala, Frederic and He, Ann and Ratner, Alexander and Ré, Christopher}, + file = {/home/dimitri/Nextcloud/Zotero/storage/DEMUUSM8/Varma et al. - 2019 - Learning Dependency Structures for Weak Supervisio.pdf;/home/dimitri/Nextcloud/Zotero/storage/QVVYCHJT/1903.html} +} + +@article{ratnerTrainingComplexModels2018, + archivePrefix = {arXiv}, + eprinttype = {arxiv}, + eprint = {1810.02840}, + primaryClass = {cs, stat}, + title = {Training {{Complex Models}} with {{Multi}}-{{Task Weak Supervision}}}, + url = {http://arxiv.org/abs/1810.02840}, + abstract = {As machine learning models continue to increase in complexity, collecting large hand-labeled training sets has become one of the biggest roadblocks in practice. Instead, weaker forms of supervision that provide noisier but cheaper labels are often used. However, these weak supervision sources have diverse and unknown accuracies, may output correlated labels, and may label different tasks or apply at different levels of granularity. 
We propose a framework for integrating and modeling such weak supervision sources by viewing them as labeling different related sub-tasks of a problem, which we refer to as the multi-task weak supervision setting. We show that by solving a matrix completion-style problem, we can recover the accuracies of these multi-task sources given their dependency structure, but without any labeled data, leading to higher-quality supervision for training an end model. Theoretically, we show that the generalization error of models trained with this approach improves with the number of unlabeled data points, and characterize the scaling with respect to the task and dependency structures. On three fine-grained classification problems, we show that our approach leads to average gains of 20.2 points in accuracy over a traditional supervised approach, 6.8 points over a majority vote baseline, and 4.1 points over a previously proposed weak supervision method that models tasks separately.}, + urldate = {2019-04-16}, + date = {2018-10-05}, + keywords = {Statistics - Machine Learning,Computer Science - Machine Learning}, + author = {Ratner, Alexander and Hancock, Braden and Dunnmon, Jared and Sala, Frederic and Pandey, Shreyash and Ré, Christopher}, + file = {/home/dimitri/Nextcloud/Zotero/storage/E38MTCZJ/Ratner et al. - 2018 - Training Complex Models with Multi-Task Weak Super.pdf;/home/dimitri/Nextcloud/Zotero/storage/VIS28T3Q/1810.html} +} + +@article{mallinarBootstrappingConversationalAgents2018, + archivePrefix = {arXiv}, + eprinttype = {arxiv}, + eprint = {1812.06176}, + primaryClass = {cs}, + title = {Bootstrapping {{Conversational Agents With Weak Supervision}}}, + url = {http://arxiv.org/abs/1812.06176}, + abstract = {Many conversational agents in the market today follow a standard bot development framework which requires training intent classifiers to recognize user input. The need to create a proper set of training examples is often the bottleneck in the development process. In many occasions agent developers have access to historical chat logs that can provide a good quantity as well as coverage of training examples. However, the cost of labeling them with tens to hundreds of intents often prohibits taking full advantage of these chat logs. In this paper, we present a framework called \textbackslash{}textit\{search, label, and propagate\} (SLP) for bootstrapping intents from existing chat logs using weak supervision. The framework reduces hours to days of labeling effort down to minutes of work by using a search engine to find examples, then relies on a data programming approach to automatically expand the labels. We report on a user study that shows positive user feedback for this new approach to build conversational agents, and demonstrates the effectiveness of using data programming for auto-labeling. While the system is developed for training conversational agents, the framework has broader application in significantly reducing labeling effort for training text classifiers.}, + urldate = {2019-04-16}, + date = {2018-12-14}, + keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language}, + author = {Mallinar, Neil and Shah, Abhishek and Ugrani, Rajendra and Gupta, Ayush and Gurusankar, Manikandan and Ho, Tin Kam and Liao, Q. Vera and Zhang, Yunfeng and Bellamy, Rachel K. E. and Yates, Robert and Desmarais, Chris and McGregor, Blake}, + file = {/home/dimitri/Nextcloud/Zotero/storage/GHW24QAZ/Mallinar et al. 
- 2018 - Bootstrapping Conversational Agents With Weak Supe.pdf;/home/dimitri/Nextcloud/Zotero/storage/9VGL3XXM/1812.html} +} + +@article{bachSnorkelDryBellCase2018, + archivePrefix = {arXiv}, + eprinttype = {arxiv}, + eprint = {1812.00417}, + primaryClass = {cs, stat}, + title = {Snorkel {{DryBell}}: {{A Case Study}} in {{Deploying Weak Supervision}} at {{Industrial Scale}}}, + url = {http://arxiv.org/abs/1812.00417}, + shorttitle = {Snorkel {{DryBell}}}, + abstract = {Labeling training data is one of the most costly bottlenecks in developing or modifying machine learning-based applications. We survey how resources from across an organization can be used as weak supervision sources for three classification tasks at Google, in order to bring development time and cost down by an order of magnitude. We build on the Snorkel framework, extending it as a new system, Snorkel DryBell, which integrates with Google's distributed production systems and enables engineers to develop and execute weak supervision strategies over millions of examples in less than thirty minutes. We find that Snorkel DryBell creates classifiers of comparable quality to ones trained using up to tens of thousands of hand-labeled examples, in part by leveraging organizational resources not servable in production which contribute an average 52\% performance improvement to the weakly supervised classifiers.}, + urldate = {2019-04-16}, + date = {2018-12-02}, + keywords = {Statistics - Machine Learning,Computer Science - Machine Learning}, + author = {Bach, Stephen H. and Rodriguez, Daniel and Liu, Yintao and Luo, Chong and Shao, Haidong and Xia, Cassandra and Sen, Souvik and Ratner, Alexander and Hancock, Braden and Alborzi, Houman and Kuchhal, Rahul and Ré, Christopher and Malkin, Rob}, + file = {/home/dimitri/Nextcloud/Zotero/storage/HCP7XGTD/Bach et al. - 2018 - Snorkel DryBell A Case Study in Deploying Weak Su.pdf;/home/dimitri/Nextcloud/Zotero/storage/Q44AHQN4/1812.html} +} + +@online{HarnessingOrganizationalKnowledge, + langid = {english}, + title = {Harnessing {{Organizational Knowledge}} for {{Machine Learning}}}, + url = {http://ai.googleblog.com/2019/03/harnessing-organizational-knowledge-for.html}, + abstract = {Posted by Alex Ratner, Stanford University and Cassandra Xia, Google AI One of the biggest bottlenecks in developing machine learning (M...}, + journaltitle = {Google AI Blog}, + urldate = {2019-04-16}, + file = {/home/dimitri/Nextcloud/Zotero/storage/496PMGC5/harnessing-organizational-knowledge-for.html} +} + +@article{keneshlooDeepReinforcementLearning2018, + archivePrefix = {arXiv}, + eprinttype = {arxiv}, + eprint = {1805.09461}, + primaryClass = {cs, stat}, + title = {Deep {{Reinforcement Learning For Sequence}} to {{Sequence Models}}}, + url = {http://arxiv.org/abs/1805.09461}, + abstract = {In recent times, sequence-to-sequence (seq2seq) models have gained a lot of popularity and provide state-of-the-art performance in a wide variety of tasks such as machine translation, headline generation, text summarization, speech to text conversion, and image caption generation. The underlying framework for all these models is usually a deep neural network comprising an encoder and a decoder. Although simple encoder-decoder models produce competitive results, many researchers have proposed additional improvements over these sequence-to-sequence models, e.g., using an attention-based model over the input, pointer-generation models, and self-attention models. 
However, such seq2seq models suffer from two common problems: 1) exposure bias and 2) inconsistency between train/test measurement. Recently, a completely novel point of view has emerged in addressing these two problems in seq2seq models, leveraging methods from reinforcement learning (RL). In this survey, we consider seq2seq problems from the RL point of view and provide a formulation combining the power of RL methods in decision-making with sequence-to-sequence models that enable remembering long-term memories. We present some of the most recent frameworks that combine concepts from RL and deep neural networks and explain how these two areas could benefit from each other in solving complex seq2seq tasks. Our work aims to provide insights into some of the problems that inherently arise with current approaches and how we can address them with better RL models. We also provide the source code for implementing most of the RL models discussed in this paper to support the complex task of abstractive text summarization.}, + urldate = {2019-04-17}, + date = {2018-05-23}, + keywords = {Statistics - Machine Learning,Computer Science - Machine Learning,I.2.6,I.2.10,I.2.7}, + author = {Keneshloo, Yaser and Shi, Tian and Ramakrishnan, Naren and Reddy, Chandan K.}, + file = {/home/dimitri/Nextcloud/Zotero/storage/I9CLUT8Y/Keneshloo et al. - 2018 - Deep Reinforcement Learning For Sequence to Sequen.pdf;/home/dimitri/Nextcloud/Zotero/storage/4RIQDPI5/1805.html} +} + +@article{hendersonRepositoryConversationalDatasets2019, + archivePrefix = {arXiv}, + eprinttype = {arxiv}, + eprint = {1904.06472}, + primaryClass = {cs}, + title = {A {{Repository}} of {{Conversational Datasets}}}, + url = {http://arxiv.org/abs/1904.06472}, + abstract = {Progress in Machine Learning is often driven by the availability of large datasets, and consistent evaluation metrics for comparing modeling approaches. To this end, we present a repository of conversational datasets consisting of hundreds of millions of examples, and a standardised evaluation procedure for conversational response selection models using '1-of-100 accuracy'. The repository contains scripts that allow researchers to reproduce the standard datasets, or to adapt the pre-processing and data filtering steps to their needs. We introduce and evaluate several competitive baselines for conversational response selection, whose implementations are shared in the repository, as well as a neural encoder model that is trained on the entire training set.}, + urldate = {2019-04-18}, + date = {2019-04-12}, + keywords = {Computer Science - Computation and Language}, + author = {Henderson, Matthew and Budzianowski, Paweł and Casanueva, Iñigo and Coope, Sam and Gerz, Daniela and Kumar, Girish and Mrkšić, Nikola and Spithourakis, Georgios and Su, Pei-Hao and Vulić, Ivan and Wen, Tsung-Hsien}, + file = {/home/dimitri/Nextcloud/Zotero/storage/ZPI7GB2I/Henderson et al. - 2019 - A Repository of Conversational Datasets.pdf;/home/dimitri/Nextcloud/Zotero/storage/II559GXU/1904.html} +} + +@article{wangWassersteinFisherRaoDocumentDistance2019, + archivePrefix = {arXiv}, + eprinttype = {arxiv}, + eprint = {1904.10294}, + primaryClass = {cs, stat}, + title = {Wasserstein-{{Fisher}}-{{Rao Document Distance}}}, + url = {http://arxiv.org/abs/1904.10294}, + abstract = {As a fundamental problem of natural language processing, it is important to measure the distance between different documents. 
Among the existing methods, the Word Mover's Distance (WMD) has shown remarkable success in document semantic matching for its clear physical insight as a parameter-free model. However, WMD is essentially based on the classical Wasserstein metric, thus it often fails to robustly represent the semantic similarity between texts of different lengths. In this paper, we apply the newly developed Wasserstein-Fisher-Rao (WFR) metric from unbalanced optimal transport theory to measure the distance between different documents. The proposed WFR document distance maintains the great interpretability and simplicity as WMD. We demonstrate that the WFR document distance has significant advantages when comparing the texts of different lengths. In addition, an accelerated Sinkhorn based algorithm with GPU implementation has been developed for the fast computation of WFR distances. The KNN classification results on eight datasets have shown its clear improvement over WMD.}, + urldate = {2019-04-24}, + date = {2019-04-23}, + keywords = {Statistics - Machine Learning,Computer Science - Computation and Language,Computer Science - Machine Learning}, + author = {Wang, Zihao and Zhou, Datong and Zhang, Yong and Wu, Hao and Bao, Chenglong}, + file = {/home/dimitri/Nextcloud/Zotero/storage/CBKF4YMF/Wang et al. - 2019 - Wasserstein-Fisher-Rao Document Distance.pdf;/home/dimitri/Nextcloud/Zotero/storage/VN8HQ7Z5/1904.html} +} + +@article{fordeScientificMethodScience2019, + archivePrefix = {arXiv}, + eprinttype = {arxiv}, + eprint = {1904.10922}, + primaryClass = {cs, stat}, + title = {The {{Scientific Method}} in the {{Science}} of {{Machine Learning}}}, + url = {http://arxiv.org/abs/1904.10922}, + abstract = {In the quest to align deep learning with the sciences to address calls for rigor, safety, and interpretability in machine learning systems, this contribution identifies key missing pieces: the stages of hypothesis formulation and testing, as well as statistical and systematic uncertainty estimation -- core tenets of the scientific method. This position paper discusses the ways in which contemporary science is conducted in other domains and identifies potentially useful practices. We present a case study from physics and describe how this field has promoted rigor through specific methodological practices, and provide recommendations on how machine learning researchers can adopt these practices into the research ecosystem. 
We argue that both domain-driven experiments and application-agnostic questions of the inner workings of fundamental building blocks of machine learning models ought to be examined with the tools of the scientific method, to ensure we not only understand effect, but also begin to understand cause, which is the raison d'\textbackslash\^\{e\}tre of science.}, + urldate = {2019-04-25}, + date = {2019-04-24}, + keywords = {Statistics - Machine Learning,Computer Science - Machine Learning}, + author = {Forde, Jessica Zosa and Paganini, Michela}, + file = {/home/dimitri/Nextcloud/Zotero/storage/YNMX99HU/Forde and Paganini - 2019 - The Scientific Method in the Science of Machine Le.pdf;/home/dimitri/Nextcloud/Zotero/storage/HEEYUG86/1904.html} +} + +@article{debieStochasticDeepNetworks2018, + archivePrefix = {arXiv}, + eprinttype = {arxiv}, + eprint = {1811.07429}, + primaryClass = {cs, stat}, + title = {Stochastic {{Deep Networks}}}, + url = {http://arxiv.org/abs/1811.07429}, + abstract = {Machine learning is increasingly targeting areas where input data cannot be accurately described by a single vector, but can be modeled instead using the more flexible concept of random vectors, namely probability measures or more simply point clouds of varying cardinality. Using deep architectures on measures poses, however, many challenging issues. Indeed, deep architectures are originally designed to handle fixedlength vectors, or, using recursive mechanisms, ordered sequences thereof. In sharp contrast, measures describe a varying number of weighted observations with no particular order. We propose in this work a deep framework designed to handle crucial aspects of measures, namely permutation invariances, variations in weights and cardinality. Architectures derived from this pipeline can (i) map measures to measures - using the concept of push-forward operators; (ii) bridge the gap between measures and Euclidean spaces - through integration steps. This allows to design discriminative networks (to classify or reduce the dimensionality of input measures), generative architectures (to synthesize measures) and recurrent pipelines (to predict measure dynamics). We provide a theoretical analysis of these building blocks, review our architectures' approximation abilities and robustness w.r.t. perturbation, and try them on various discriminative and generative tasks.}, + urldate = {2019-04-25}, + date = {2018-11-18}, + keywords = {Statistics - Machine Learning,Computer Science - Machine Learning}, + author = {family=Bie, given=Gwendoline, prefix=de, useprefix=true and Peyré, Gabriel and Cuturi, Marco}, + file = {/home/dimitri/Nextcloud/Zotero/storage/6P7NQ9RK/de Bie et al. - 2018 - Stochastic Deep Networks.pdf;/home/dimitri/Nextcloud/Zotero/storage/PIGZRQRU/1811.html} +} + +@article{grothendieckQuelquesPointsAlgebre1957, + langid = {english}, + title = {Sur Quelques Points d'algèbre Homologique, {{I}}}, + volume = {9}, + issn = {0040-8735}, + url = {http://projecteuclid.org/euclid.tmj/1178244839}, + doi = {10.2748/tmj/1178244839}, + number = {2}, + journaltitle = {Tohoku Mathematical Journal}, + shortjournal = {Tohoku Math. 
J.}, + urldate = {2019-04-29}, + date = {1957}, + pages = {119-221}, + author = {Grothendieck, Alexander}, + file = {/home/dimitri/Nextcloud/Zotero/storage/UF4CX7FF/Grothendieck - 1957 - Sur quelques points d'algèbre homologique, I.pdf} +} + +@article{fongSevenSketchesCompositionality2018, + archivePrefix = {arXiv}, + eprinttype = {arxiv}, + eprint = {1803.05316}, + primaryClass = {math}, + title = {Seven {{Sketches}} in {{Compositionality}}: {{An Invitation}} to {{Applied Category Theory}}}, + url = {http://arxiv.org/abs/1803.05316}, + shorttitle = {Seven {{Sketches}} in {{Compositionality}}}, + abstract = {This book is an invitation to discover advanced topics in category theory through concrete, real-world examples. It aims to give a tour: a gentle, quick introduction to guide later exploration. The tour takes place over seven sketches, each pairing an evocative application, such as databases, electric circuits, or dynamical systems, with the exploration of a categorical structure, such as adjoint functors, enriched categories, or toposes. No prior knowledge of category theory is assumed. A feedback form for typos, comments, questions, and suggestions is available here: https://docs.google.com/document/d/160G9OFcP5DWT8Stn7TxdVx83DJnnf7d5GML0\_FOD5Wg/edit}, + urldate = {2019-04-29}, + date = {2018-03-14}, + keywords = {Mathematics - Category Theory,18-01}, + author = {Fong, Brendan and Spivak, David I.}, + file = {/home/dimitri/Nextcloud/Zotero/storage/WBUCWRPK/Fong and Spivak - 2018 - Seven Sketches in Compositionality An Invitation .pdf;/home/dimitri/Nextcloud/Zotero/storage/MT7MPULY/1803.html} +} + +@article{chenNeuralNaturalLanguage2017, + archivePrefix = {arXiv}, + eprinttype = {arxiv}, + eprint = {1711.04289}, + primaryClass = {cs}, + title = {Neural {{Natural Language Inference Models Enhanced}} with {{External Knowledge}}}, + url = {http://arxiv.org/abs/1711.04289}, + abstract = {Modeling natural language inference is a very challenging task. With the availability of large annotated data, it has recently become feasible to train complex models such as neural-network-based inference models, which have shown to achieve the state-of-the-art performance. Although there exist relatively large annotated data, can machines learn all knowledge needed to perform natural language inference (NLI) from these data? If not, how can neural-network-based NLI models benefit from external knowledge and how to build NLI models to leverage it? In this paper, we enrich the state-of-the-art neural natural language inference models with external knowledge. We demonstrate that the proposed models improve neural NLI models to achieve the state-of-the-art performance on the SNLI and MultiNLI datasets.}, + urldate = {2019-04-30}, + date = {2017-11-12}, + keywords = {Computer Science - Computation and Language}, + author = {Chen, Qian and Zhu, Xiaodan and Ling, Zhen-Hua and Inkpen, Diana and Wei, Si}, + file = {/home/dimitri/Nextcloud/Zotero/storage/TM3J9VAZ/Chen et al. 
- 2017 - Neural Natural Language Inference Models Enhanced .pdf;/home/dimitri/Nextcloud/Zotero/storage/5KNZ9MVE/1711.html} +} + +@article{kolouriOptimalMassTransport2017, + title = {Optimal {{Mass Transport}}: {{Signal}} Processing and Machine-Learning Applications}, + volume = {34}, + issn = {1053-5888}, + url = {http://ieeexplore.ieee.org/document/7974883/}, + doi = {10.1109/MSP.2017.2695801}, + shorttitle = {Optimal {{Mass Transport}}}, + number = {4}, + journaltitle = {IEEE Signal Processing Magazine}, + shortjournal = {IEEE Signal Process. Mag.}, + urldate = {2019-04-30}, + date = {2017-07}, + pages = {43-59}, + author = {Kolouri, Soheil and Park, Se Rim and Thorpe, Matthew and Slepcev, Dejan and Rohde, Gustavo K.}, + file = {/home/dimitri/Nextcloud/Zotero/storage/9YQEAT7J/Kolouri et al. - 2017 - Optimal Mass Transport Signal processing and mach.pdf} +} + +@article{schmidtSilurianHypothesisWould2019, + langid = {english}, + title = {The {{Silurian}} Hypothesis: Would It Be Possible to Detect an Industrial Civilization in the Geological Record?}, + volume = {18}, + issn = {1473-5504, 1475-3006}, + url = {https://www.cambridge.org/core/product/identifier/S1473550418000095/type/journal_article}, + doi = {10.1017/S1473550418000095}, + shorttitle = {The {{Silurian}} Hypothesis}, + abstract = {If an industrial civilization had existed on Earth many millions of years prior to our own era, what traces would it have left and would they be detectable today? We summarize the likely geological fingerprint of the Anthropocene, and demonstrate that while clear, it will not differ greatly in many respects from other known events in the geological record. We then propose tests that could plausibly distinguish an industrial cause from an otherwise naturally occurring climate event.}, + number = {2}, + journaltitle = {International Journal of Astrobiology}, + shortjournal = {International Journal of Astrobiology}, + urldate = {2019-05-01}, + date = {2019-04}, + pages = {142-150}, + author = {Schmidt, Gavin A. and Frank, Adam}, + file = {/home/dimitri/Nextcloud/Zotero/storage/XLHC4GEB/Schmidt_Frank_2019_The Silurian hypothesis.pdf} +} + +@article{aschOpinionsSocialPressure1955, + title = {Opinions and {{Social Pressure}}}, + volume = {193}, + issn = {0036-8733}, + url = {https://www.jstor.org/stable/24943779}, + number = {5}, + journaltitle = {Scientific American}, + urldate = {2019-05-01}, + date = {1955}, + pages = {31-35}, + author = {Asch, Solomon E.}, + file = {/home/dimitri/Nextcloud/Zotero/storage/99479367/Asch_1955_Opinions and Social Pressure.pdf} +} + +@article{slivkinsIntroductionMultiArmedBandits2019, + archivePrefix = {arXiv}, + eprinttype = {arxiv}, + eprint = {1904.07272}, + primaryClass = {cs, stat}, + title = {Introduction to {{Multi}}-{{Armed Bandits}}}, + url = {http://arxiv.org/abs/1904.07272}, + abstract = {Multi-armed bandits a simple but very powerful framework for algorithms that make decisions over time under uncertainty. An enormous body of work has accumulated over the years, covered in several books and surveys. This book provides a more introductory, textbook-like treatment of the subject. Each chapter tackles a particular line of work, providing a self-contained, teachable technical introduction and a review of the more advanced results. 
The chapters are as follows: Stochastic bandits; Lower bounds; Bayesian Bandits and Thompson Sampling; Lipschitz Bandits; Full Feedback and Adversarial Costs; Adversarial Bandits; Linear Costs and Semi-bandits; Contextual Bandits; Bandits and Zero-Sum Games; Bandits with Knapsacks; Incentivized Exploration and Connections to Mechanism Design.}, + urldate = {2019-05-02}, + date = {2019-04-15}, + keywords = {Statistics - Machine Learning,Computer Science - Artificial Intelligence,Computer Science - Machine Learning,Computer Science - Data Structures and Algorithms}, + author = {Slivkins, Aleksandrs}, + file = {/home/dimitri/Nextcloud/Zotero/storage/2TJYMBL9/Slivkins - 2019 - Introduction to Multi-Armed Bandits.pdf;/home/dimitri/Nextcloud/Zotero/storage/EUHLP8I9/1904.html} +} + +@article{botvinickReinforcementLearningFast2019, + langid = {english}, + title = {Reinforcement {{Learning}}, {{Fast}} and {{Slow}}}, + issn = {13646613}, + url = {https://linkinghub.elsevier.com/retrieve/pii/S1364661319300610}, + doi = {10.1016/j.tics.2019.02.006}, + journaltitle = {Trends in Cognitive Sciences}, + shortjournal = {Trends in Cognitive Sciences}, + urldate = {2019-05-03}, + date = {2019-04}, + pages = {S1364661319300610}, + author = {Botvinick, Matthew and Ritter, Sam and Wang, Jane X. and Kurth-Nelson, Zeb and Blundell, Charles and Hassabis, Demis}, + file = {/home/dimitri/Nextcloud/Zotero/storage/RXKZECCA/Botvinick et al. - 2019 - Reinforcement Learning, Fast and Slow.pdf} +} + +@inproceedings{kimEfficientBayesianInference2013, + location = {{Istanbul, Turkey}}, + title = {Efficient {{Bayesian}} Inference Methods via Convex Optimization and Optimal Transport}, + isbn = {978-1-4799-0446-4}, + url = {http://ieeexplore.ieee.org/document/6620628/}, + doi = {10.1109/ISIT.2013.6620628}, + eventtitle = {2013 {{IEEE International Symposium}} on {{Information Theory}} ({{ISIT}})}, + booktitle = {2013 {{IEEE International Symposium}} on {{Information Theory}}}, + publisher = {{IEEE}}, + urldate = {2019-05-03}, + date = {2013-07}, + pages = {2259-2263}, + author = {Kim, Sanggyun and Ma, Rui and Mesa, Diego and Coleman, Todd P.}, + file = {/home/dimitri/Nextcloud/Zotero/storage/9RGN7V3H/Kim et al. - 2013 - Efficient Bayesian inference methods via convex op.pdf} +} + +@article{rahwanMachineBehaviour2019, + langid = {english}, + title = {Machine Behaviour}, + volume = {568}, + issn = {0028-0836, 1476-4687}, + url = {http://www.nature.com/articles/s41586-019-1138-y}, + doi = {10.1038/s41586-019-1138-y}, + number = {7753}, + journaltitle = {Nature}, + shortjournal = {Nature}, + urldate = {2019-05-03}, + date = {2019-04}, + pages = {477-486}, + author = {Rahwan, Iyad and Cebrian, Manuel and Obradovich, Nick and Bongard, Josh and Bonnefon, Jean-François and Breazeal, Cynthia and Crandall, Jacob W. and Christakis, Nicholas A. and Couzin, Iain D. and Jackson, Matthew O. and Jennings, Nicholas R. and Kamar, Ece and Kloumann, Isabel M. and Larochelle, Hugo and Lazer, David and McElreath, Richard and Mislove, Alan and Parkes, David C. and Pentland, Alex ‘Sandy’ and Roberts, Margaret E. and Shariff, Azim and Tenenbaum, Joshua B. and Wellman, Michael}, + file = {/home/dimitri/Nextcloud/Zotero/storage/VDKW2YEL/Rahwan et al. 
- 2019 - Machine behaviour.pdf} +} + +@article{ducklowUpperOceanCarbon2001, + title = {Upper {{Ocean Carbon Export}} and the {{Biological Pump}}}, + volume = {14}, + issn = {10428275}, + url = {https://tos.org/oceanography/article/upper-ocean-carbon-export-and-the-biological-pump}, + doi = {10.5670/oceanog.2001.06}, + number = {4}, + journaltitle = {Oceanography}, + shortjournal = {oceanog}, + urldate = {2019-05-04}, + date = {2001}, + pages = {50-58}, + author = {Ducklow, Hugh and Steinberg, Deborah and Buesseler, Ken}, + file = {/home/dimitri/Nextcloud/Zotero/storage/ZZ25525E/Ducklow et al_2001_Upper Ocean Carbon Export and the Biological Pump.pdf} +} + +@article{monroeEcoevolutionaryDynamicsCarbon2018, + langid = {english}, + title = {Ecoevolutionary {{Dynamics}} of {{Carbon Cycling}} in the {{Anthropocene}}}, + volume = {33}, + issn = {01695347}, + url = {https://linkinghub.elsevier.com/retrieve/pii/S0169534717303245}, + doi = {10.1016/j.tree.2017.12.006}, + number = {3}, + journaltitle = {Trends in Ecology \& Evolution}, + shortjournal = {Trends in Ecology \& Evolution}, + urldate = {2019-05-04}, + date = {2018-03}, + pages = {213-225}, + author = {Monroe, J. Grey and Markman, David W. and Beck, Whitney S. and Felton, Andrew J. and Vahsen, Megan L. and Pressler, Yamina}, + file = {/home/dimitri/Nextcloud/Zotero/storage/5HAIPSSV/Monroe et al_2018_Ecoevolutionary Dynamics of Carbon Cycling in the Anthropocene.pdf} +} + +@article{wommackVirioplanktonVirusesAquatic2000, + langid = {english}, + title = {Virioplankton: {{Viruses}} in {{Aquatic Ecosystems}}}, + volume = {64}, + issn = {1092-2172}, + url = {http://mmbr.asm.org/cgi/doi/10.1128/MMBR.64.1.69-114.2000}, + doi = {10.1128/MMBR.64.1.69-114.2000}, + shorttitle = {Virioplankton}, + number = {1}, + journaltitle = {Microbiology and Molecular Biology Reviews}, + shortjournal = {Microbiology and Molecular Biology Reviews}, + urldate = {2019-05-04}, + date = {2000-03-01}, + pages = {69-114}, + author = {Wommack, K. E. and Colwell, R. R.}, + file = {/home/dimitri/Nextcloud/Zotero/storage/N5ZGVA2P/Wommack_Colwell_2000_Virioplankton.pdf} +} + +@article{munnVirusesPathogensMarine2006, + langid = {english}, + title = {Viruses as Pathogens of Marine Organisms—from Bacteria to Whales}, + volume = {86}, + issn = {0025-3154, 1469-7769}, + url = {https://www.cambridge.org/core/product/identifier/S002531540601335X/type/journal_article}, + doi = {10.1017/S002531540601335X}, + abstract = {Viruses are the most abundant members of marine ecosystems and play an enormous role in ocean processes through their interactions with all types of marine organisms. This short review provides examples of the dramatic increase in our knowledge of the diversity of marine viruses as pathogens of bacteria, protists, molluscs, crustaceans, cnidaria, reptiles, fish and mammals. Several examples are provided showing evidence of evolution of new strains, changes in virulence, and transfer of viruses between ecosystems. The natural and anthropogenic causes of these shifts are discussed. Despite considerable advances in recent years, knowledge of the importance of viruses in many important groups of marine organisms is lacking or incomplete. Suggestions for future investigations necessary to understand the dynamics of biogeochemical processes and the impacts of disease in our oceans are proposed.}, + number = {3}, + journaltitle = {Journal of the Marine Biological Association of the United Kingdom}, + shortjournal = {J. Mar. Biol. 
Ass.}, + urldate = {2019-05-04}, + date = {2006-06}, + pages = {453-467}, + author = {Munn, Colin B.}, + file = {/home/dimitri/Nextcloud/Zotero/storage/HRU3Z5XC/Munn_2006_Viruses as pathogens of marine organisms—from bacteria to whales.pdf} +} + +@article{ravenAquaticVirusesEmerging2006, + langid = {english}, + title = {Aquatic Viruses: The Emerging Story}, + volume = {86}, + issn = {0025-3154, 1469-7769}, + url = {https://www.cambridge.org/core/product/identifier/S0025315406013348/type/journal_article}, + doi = {10.1017/S0025315406013348}, + shorttitle = {Aquatic Viruses}, + abstract = {It is likely that all living organisms can be infected by one or more viruses. One of the latest higher taxa to be converted from ‘no characterized viruses’ to ‘well characterized viruses’ are the diatoms (Bacillariophyceae, Heterokontophyta) with the recent publication of three papers characterizing an ssRNA and a ssDNA virus from two genera (Chaetoceros and Rhizosolenia) of marine planktonic diatom (Nagasaki et al., 2004, 2005; Bettarel et al., 2005). It would have been strange if viruses had not been able to exploit the dominant, in terms of global primary production, photosynthetic organisms in the ocean (assimilating perhaps as much as 20 Pg inorganic C into organic C per year), despite the less than completely convincing arguments assembled by Raven \& Waite (2004) as to possible anti-viral defences unique to diatoms.}, + number = {3}, + journaltitle = {Journal of the Marine Biological Association of the United Kingdom}, + shortjournal = {J. Mar. Biol. Ass.}, + urldate = {2019-05-04}, + date = {2006-06}, + pages = {449-451}, + author = {Raven, John A.}, + file = {/home/dimitri/Nextcloud/Zotero/storage/PFL36J3D/Raven_2006_Aquatic viruses.pdf} +} + +@article{suttleMarineVirusesMajor2007, + langid = {english}, + title = {Marine Viruses — Major Players in the Global Ecosystem}, + volume = {5}, + issn = {1740-1526, 1740-1534}, + url = {http://www.nature.com/articles/nrmicro1750}, + doi = {10.1038/nrmicro1750}, + number = {10}, + journaltitle = {Nature Reviews Microbiology}, + shortjournal = {Nat Rev Microbiol}, + urldate = {2019-05-04}, + date = {2007-10}, + pages = {801-812}, + author = {Suttle, Curtis A.}, + file = {/home/dimitri/Nextcloud/Zotero/storage/LQ6LQA2N/Suttle_2007_Marine viruses — major players in the global ecosystem.pdf} +} + +@article{awodeyStructuralismInvarianceUnivalence2014, + langid = {english}, + title = {Structuralism, {{Invariance}}, and {{Univalence}}}, + volume = {22}, + issn = {0031-8019, 1744-6406}, + url = {https://academic.oup.com/philmat/article-lookup/doi/10.1093/philmat/nkt030}, + doi = {10.1093/philmat/nkt030}, + number = {1}, + journaltitle = {Philosophia Mathematica}, + shortjournal = {Philosophia Mathematica}, + urldate = {2019-05-06}, + date = {2014-02-01}, + pages = {1-11}, + author = {Awodey, S.}, + file = {/home/dimitri/Nextcloud/Zotero/storage/LYGHZV5Y/awodey2013.pdf} +} + +@article{dulac-arnoldChallengesRealWorldReinforcement2019, + archivePrefix = {arXiv}, + eprinttype = {arxiv}, + eprint = {1904.12901}, + primaryClass = {cs, stat}, + title = {Challenges of {{Real}}-{{World Reinforcement Learning}}}, + url = {http://arxiv.org/abs/1904.12901}, + abstract = {Reinforcement learning (RL) has proven its worth in a series of artificial domains, and is beginning to show some successes in real-world scenarios. 
However, much of the research advances in RL are often hard to leverage in real-world systems due to a series of assumptions that are rarely satisfied in practice. We present a set of nine unique challenges that must be addressed to productionize RL to real world problems. For each of these challenges, we specify the exact meaning of the challenge, present some approaches from the literature, and specify some metrics for evaluating that challenge. An approach that addresses all nine challenges would be applicable to a large number of real world problems. We also present an example domain that has been modified to present these challenges as a testbed for practical RL research.}, + urldate = {2019-05-07}, + date = {2019-04-29}, + keywords = {Statistics - Machine Learning,Computer Science - Artificial Intelligence,Computer Science - Machine Learning,Computer Science - Robotics}, + author = {Dulac-Arnold, Gabriel and Mankowitz, Daniel and Hester, Todd}, + file = {/home/dimitri/Nextcloud/Zotero/storage/3RZPJH7U/Dulac-Arnold et al. - 2019 - Challenges of Real-World Reinforcement Learning.pdf;/home/dimitri/Nextcloud/Zotero/storage/BXN8X7BS/1904.html} +} + +@article{yiCoherentEngagingSpoken2019, + archivePrefix = {arXiv}, + eprinttype = {arxiv}, + eprint = {1904.13015}, + primaryClass = {cs}, + title = {Towards {{Coherent}} and {{Engaging Spoken Dialog Response Generation Using Automatic Conversation Evaluators}}}, + url = {http://arxiv.org/abs/1904.13015}, + abstract = {Encoder-decoder based neural architectures serve as the basis of state-of-the-art approaches in end-to-end open domain dialog systems. Since most of such systems are trained with a maximum likelihood(MLE) objective they suffer from issues such as lack of generalizability and the generic response problem, i.e., a system response that can be an answer to a large number of user utterances, e.g., "Maybe, I don't know." Having explicit feedback on the relevance and interestingness of a system response at each turn can be a useful signal for mitigating such issues and improving system quality by selecting responses from different approaches. Towards this goal, we present a system that evaluates chatbot responses at each dialog turn for coherence and engagement. Our system provides explicit turn-level dialog quality feedback, which we show to be highly correlated with human evaluation. To show that incorporating this feedback in the neural response generation models improves dialog quality, we present two different and complementary mechanisms to incorporate explicit feedback into a neural response generation model: reranking and direct modification of the loss function during training. Our studies show that a response generation model that incorporates these combined feedback mechanisms produce more engaging and coherent responses in an open-domain spoken dialog setting, significantly improving the response quality using both automatic and human evaluation.}, + urldate = {2019-05-07}, + date = {2019-04-29}, + keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language,Computer Science - Machine Learning}, + author = {Yi, Sanghyun and Goel, Rahul and Khatri, Chandra and Cervone, Alessandra and Chung, Tagyoung and Hedayatnia, Behnam and Venkatesh, Anu and Gabriel, Raefer and Hakkani-Tur, Dilek}, + file = {/home/dimitri/Nextcloud/Zotero/storage/RTMASP7G/Yi et al. 
- 2019 - Towards Coherent and Engaging Spoken Dialog Respon.pdf;/home/dimitri/Nextcloud/Zotero/storage/LUTDPQK5/1904.html} +} + +@inproceedings{stolckeDialogActModelling1998, + langid = {english}, + title = {Dialog Act Modelling for Conversational Speech}, + isbn = {978-1-57735-046-0}, + url = {https://www.era.lib.ed.ac.uk/handle/1842/1045}, + abstract = {We describe an integrated approach for statistical modeling of discourse structure for natural conversational speech. Our model is based on 42 'dialog acts’ (e.g., Statement, Question, Backchannel, Agreement, Disagreement, Apology), which were hand-labeled in 1155 conversations from the Switchboard corpus of spontaneous human-to-human telephone speech. We developed several models and algorithms to automatically detect dialog acts from transcribed or automatically recognized words and from prosodic properties of the speech signal, and by using a statistical discourse grammar. All of these components were probabilistic in nature and estimated from data, employing a variety of techniques (hidden Markov models, N-gram language models, maximum entropy estimation, decision tree classifiers, and neural networks). In preliminary studies, we achieved a dialog act labeling accuracy of 65\% based on recognized words and prosody, and an accuracy of 72\% based on word transcripts. Since humans achieve 84\% on this task (with chance performance at 35\%) we find these results encouraging.}, + publisher = {{AAAI Press}}, + urldate = {2019-05-07}, + date = {1998}, + author = {Stolcke, Andreas and Shriberg, Elizabeth and Bates, Rebecca and Coccaro, Noah and Jurafsky, Daniel and Martin, Rachel and Meteer, Marie and Ries, Klaus and Taylor, Paul and Van Ess-Dykema, Carol}, + file = {/home/dimitri/Nextcloud/Zotero/storage/HR25EBGW/Stolcke et al. - 1998 - Dialog act modelling for conversational speech.pdf;/home/dimitri/Nextcloud/Zotero/storage/QCW9JBZL/1045.html} +} + +@incollection{courtyDomainAdaptationRegularized2014, + location = {{Berlin, Heidelberg}}, + title = {Domain {{Adaptation}} with {{Regularized Optimal Transport}}}, + volume = {8724}, + isbn = {978-3-662-44847-2 978-3-662-44848-9}, + url = {http://link.springer.com/10.1007/978-3-662-44848-9_18}, + booktitle = {Machine {{Learning}} and {{Knowledge Discovery}} in {{Databases}}}, + publisher = {{Springer Berlin Heidelberg}}, + urldate = {2019-05-09}, + date = {2014}, + pages = {274-289}, + author = {Courty, Nicolas and Flamary, Rémi and Tuia, Devis}, + editor = {Calders, Toon and Esposito, Floriana and Hüllermeier, Eyke and Meo, Rosa}, + file = {/home/dimitri/Nextcloud/Zotero/storage/5TY3JXAY/Courty et al. - 2014 - Domain Adaptation with Regularized Optimal Transpo.pdf}, + doi = {10.1007/978-3-662-44848-9_18} +} + +@article{courtyOptimalTransportDomain2017, + title = {Optimal {{Transport}} for {{Domain Adaptation}}}, + volume = {39}, + issn = {0162-8828, 2160-9292}, + url = {http://ieeexplore.ieee.org/document/7586038/}, + doi = {10.1109/TPAMI.2016.2615921}, + number = {9}, + journaltitle = {IEEE Transactions on Pattern Analysis and Machine Intelligence}, + shortjournal = {IEEE Trans. Pattern Anal. Mach. Intell.}, + urldate = {2019-05-09}, + date = {2017-09-01}, + pages = {1853-1865}, + author = {Courty, Nicolas and Flamary, Remi and Tuia, Devis and Rakotomamonjy, Alain}, + file = {/home/dimitri/Nextcloud/Zotero/storage/6FUGX4KX/Courty et al. 
- 2017 - Optimal Transport for Domain Adaptation.pdf} +} + +@article{elmoselhyBayesianInferenceOptimal2012, + langid = {english}, + title = {Bayesian Inference with Optimal Maps}, + volume = {231}, + issn = {00219991}, + url = {https://linkinghub.elsevier.com/retrieve/pii/S0021999112003956}, + doi = {10.1016/j.jcp.2012.07.022}, + number = {23}, + journaltitle = {Journal of Computational Physics}, + shortjournal = {Journal of Computational Physics}, + urldate = {2019-05-09}, + date = {2012-10}, + pages = {7815-7850}, + author = {El Moselhy, Tarek A. and Marzouk, Youssef M.}, + file = {/home/dimitri/Nextcloud/Zotero/storage/AK5CQTRP/El Moselhy and Marzouk - 2012 - Bayesian inference with optimal maps.pdf} +} + +@incollection{frognerLearningWassersteinLoss2015, + title = {Learning with a {{Wasserstein Loss}}}, + url = {http://papers.nips.cc/paper/5679-learning-with-a-wasserstein-loss.pdf}, + booktitle = {Advances in {{Neural Information Processing Systems}} 28}, + publisher = {{Curran Associates, Inc.}}, + urldate = {2019-05-09}, + date = {2015}, + pages = {2053--2061}, + author = {Frogner, Charlie and Zhang, Chiyuan and Mobahi, Hossein and Araya, Mauricio and Poggio, Tomaso A}, + editor = {Cortes, C. and Lawrence, N. D. and Lee, D. D. and Sugiyama, M. and Garnett, R.}, + file = {/home/dimitri/Nextcloud/Zotero/storage/AJGYHVMN/Frogner et al. - 2015 - Learning with a Wasserstein Loss.pdf;/home/dimitri/Nextcloud/Zotero/storage/LIAAVV45/5679-learning-with-a-wasserstein-loss.html} +} + +@article{bonneelSlicedRadonWasserstein2015, + langid = {english}, + title = {Sliced and {{Radon Wasserstein Barycenters}} of {{Measures}}}, + volume = {51}, + issn = {0924-9907, 1573-7683}, + url = {http://link.springer.com/10.1007/s10851-014-0506-3}, + doi = {10.1007/s10851-014-0506-3}, + number = {1}, + journaltitle = {Journal of Mathematical Imaging and Vision}, + shortjournal = {J Math Imaging Vis}, + urldate = {2019-05-09}, + date = {2015-01}, + pages = {22-45}, + author = {Bonneel, Nicolas and Rabin, Julien and Peyré, Gabriel and Pfister, Hanspeter}, + file = {/home/dimitri/Nextcloud/Zotero/storage/9SKGYDK4/bonneel2014.pdf} +} + +@article{frognerLearningEmbeddingsEntropic2019, + archivePrefix = {arXiv}, + eprinttype = {arxiv}, + eprint = {1905.03329}, + primaryClass = {cs, stat}, + title = {Learning {{Embeddings}} into {{Entropic Wasserstein Spaces}}}, + url = {http://arxiv.org/abs/1905.03329}, + abstract = {Euclidean embeddings of data are fundamentally limited in their ability to capture latent semantic structures, which need not conform to Euclidean spatial assumptions. Here we consider an alternative, which embeds data as discrete probability distributions in a Wasserstein space, endowed with an optimal transport metric. Wasserstein spaces are much larger and more flexible than Euclidean spaces, in that they can successfully embed a wider variety of metric structures. We exploit this flexibility by learning an embedding that captures semantic information in the Wasserstein distance between embedded distributions. We examine empirically the representational capacity of our learned Wasserstein embeddings, showing that they can embed a wide variety of metric structures with smaller distortion than an equivalent Euclidean embedding. We also investigate an application to word embedding, demonstrating a unique advantage of Wasserstein embeddings: We can visualize the high-dimensional embedding directly, since it is a probability distribution on a low-dimensional space. 
This obviates the need for dimensionality reduction techniques like t-SNE for visualization.}, + urldate = {2019-05-10}, + date = {2019-05-08}, + keywords = {Statistics - Machine Learning,Computer Science - Machine Learning}, + author = {Frogner, Charlie and Mirzazadeh, Farzaneh and Solomon, Justin}, + file = {/home/dimitri/Nextcloud/Zotero/storage/UAEFXQR2/Frogner et al. - 2019 - Learning Embeddings into Entropic Wasserstein Spac.pdf;/home/dimitri/Nextcloud/Zotero/storage/AY98Y254/1905.html} +} + +@incollection{cuturiSinkhornDistancesLightspeed2013, + title = {Sinkhorn {{Distances}}: {{Lightspeed Computation}} of {{Optimal Transport}}}, + url = {http://papers.nips.cc/paper/4927-sinkhorn-distances-lightspeed-computation-of-optimal-transport.pdf}, + shorttitle = {Sinkhorn {{Distances}}}, + booktitle = {Advances in {{Neural Information Processing Systems}} 26}, + publisher = {{Curran Associates, Inc.}}, + urldate = {2019-05-10}, + date = {2013}, + pages = {2292--2300}, + author = {Cuturi, Marco}, + editor = {Burges, C. J. C. and Bottou, L. and Welling, M. and Ghahramani, Z. and Weinberger, K. Q.}, + file = {/home/dimitri/Nextcloud/Zotero/storage/ALCWJV4I/Cuturi - 2013 - Sinkhorn Distances Lightspeed Computation of Opti.pdf;/home/dimitri/Nextcloud/Zotero/storage/D2BFL994/4927-sinkhorn-distances-lightspeed-computation-of-optimal-transport.html} +} + +@article{deriuSurveyEvaluationMethods2019, + archivePrefix = {arXiv}, + eprinttype = {arxiv}, + eprint = {1905.04071}, + primaryClass = {cs}, + title = {Survey on {{Evaluation Methods}} for {{Dialogue Systems}}}, + url = {http://arxiv.org/abs/1905.04071}, + abstract = {In this paper we survey the methods and concepts developed for the evaluation of dialogue systems. Evaluation is a crucial part during the development process. Often, dialogue systems are evaluated by means of human evaluations and questionnaires. However, this tends to be very cost and time intensive. Thus, much work has been put into finding methods, which allow to reduce the involvement of human labour. In this survey, we present the main concepts and methods. For this, we differentiate between the various classes of dialogue systems (task-oriented dialogue systems, conversational dialogue systems, and question-answering dialogue systems). We cover each class by introducing the main technologies developed for the dialogue systems and then by presenting the evaluation methods regarding this class.}, + urldate = {2019-05-13}, + date = {2019-05-10}, + keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language,Computer Science - Machine Learning,Computer Science - Human-Computer Interaction}, + author = {Deriu, Jan and Rodrigo, Alvaro and Otegi, Arantxa and Echegoyen, Guillermo and Rosset, Sophie and Agirre, Eneko and Cieliebak, Mark}, + file = {/home/dimitri/Nextcloud/Zotero/storage/C3ELPHN2/Deriu et al. 
- 2019 - Survey on Evaluation Methods for Dialogue Systems.pdf;/home/dimitri/Nextcloud/Zotero/storage/EIMU5STH/1905.html} +} + +@online{AINotesInitializing, + title = {{{AI Notes}}: {{Initializing}} Neural Networks}, + url = {https://www.deeplearning.ai/ai-notes/initialization/}, + shorttitle = {{{AI Notes}}}, + abstract = {AI Notes: Initializing neural networks - deeplearning.ai}, + journaltitle = {deeplearning.ai}, + urldate = {2019-05-13}, + file = {/home/dimitri/Nextcloud/Zotero/storage/YS7UD2IB/initialization.html} +} + +@article{serbanDeepReinforcementLearning2017, + archivePrefix = {arXiv}, + eprinttype = {arxiv}, + eprint = {1709.02349}, + primaryClass = {cs, stat}, + title = {A {{Deep Reinforcement Learning Chatbot}}}, + url = {http://arxiv.org/abs/1709.02349}, + abstract = {We present MILABOT: a deep reinforcement learning chatbot developed by the Montreal Institute for Learning Algorithms (MILA) for the Amazon Alexa Prize competition. MILABOT is capable of conversing with humans on popular small talk topics through both speech and text. The system consists of an ensemble of natural language generation and retrieval models, including template-based models, bag-of-words models, sequence-to-sequence neural network and latent variable neural network models. By applying reinforcement learning to crowdsourced data and real-world user interactions, the system has been trained to select an appropriate response from the models in its ensemble. The system has been evaluated through A/B testing with real-world users, where it performed significantly better than many competing systems. Due to its machine learning architecture, the system is likely to improve with additional data.}, + urldate = {2019-05-14}, + date = {2017-09-07}, + keywords = {Statistics - Machine Learning,Computer Science - Artificial Intelligence,Computer Science - Computation and Language,Computer Science - Machine Learning,Computer Science - Neural and Evolutionary Computing,I.2.7,I.5.1}, + author = {Serban, Iulian V. and Sankar, Chinnadhurai and Germain, Mathieu and Zhang, Saizheng and Lin, Zhouhan and Subramanian, Sandeep and Kim, Taesup and Pieper, Michael and Chandar, Sarath and Ke, Nan Rosemary and Rajeshwar, Sai and family=Brebisson, given=Alexandre, prefix=de, useprefix=true and Sotelo, Jose M. R. and Suhubdy, Dendi and Michalski, Vincent and Nguyen, Alexandre and Pineau, Joelle and Bengio, Yoshua}, + file = {/home/dimitri/Nextcloud/Zotero/storage/TC9VIFMX/Serban et al. - 2017 - A Deep Reinforcement Learning Chatbot.pdf;/home/dimitri/Nextcloud/Zotero/storage/F6BFAJSK/1709.html} +} + +@article{shenOrderedNeuronsIntegrating2018, + archivePrefix = {arXiv}, + eprinttype = {arxiv}, + eprint = {1810.09536}, + primaryClass = {cs}, + title = {Ordered {{Neurons}}: {{Integrating Tree Structures}} into {{Recurrent Neural Networks}}}, + url = {http://arxiv.org/abs/1810.09536}, + shorttitle = {Ordered {{Neurons}}}, + abstract = {Natural language is hierarchically structured: smaller units (e.g., phrases) are nested within larger units (e.g., clauses). When a larger constituent ends, all of the smaller constituents that are nested within it must also be closed. While the standard LSTM architecture allows different neurons to track information at different time scales, it does not have an explicit bias towards modeling a hierarchy of constituents. 
This paper proposes to add such an inductive bias by ordering the neurons; a vector of master input and forget gates ensures that when a given neuron is updated, all the neurons that follow it in the ordering are also updated. Our novel recurrent architecture, ordered neurons LSTM (ON-LSTM), achieves good performance on four different tasks: language modeling, unsupervised parsing, targeted syntactic evaluation, and logical inference.}, + urldate = {2019-05-14}, + date = {2018-10-22}, + keywords = {Computer Science - Computation and Language,Computer Science - Machine Learning}, + author = {Shen, Yikang and Tan, Shawn and Sordoni, Alessandro and Courville, Aaron}, + file = {/home/dimitri/Nextcloud/Zotero/storage/NEEU5N2C/Shen et al. - 2018 - Ordered Neurons Integrating Tree Structures into .pdf;/home/dimitri/Nextcloud/Zotero/storage/SASXRX76/1810.html} +} + +@article{kawaguchiGeneralizationDeepLearning2017, + archivePrefix = {arXiv}, + eprinttype = {arxiv}, + eprint = {1710.05468}, + primaryClass = {cs, stat}, + title = {Generalization in {{Deep Learning}}}, + url = {http://arxiv.org/abs/1710.05468}, + abstract = {This paper provides non-vacuous and numerically-tight generalization guarantees for deep learning, as well as theoretical insights into why and how deep learning can generalize well, despite its large capacity, complexity, possible algorithmic instability, nonrobustness, and sharp minima, responding to an open question in the literature. We also propose new open problems and discuss the limitations of our results.}, + urldate = {2019-05-14}, + date = {2017-10-15}, + keywords = {Statistics - Machine Learning,Computer Science - Artificial Intelligence,Computer Science - Machine Learning,Computer Science - Neural and Evolutionary Computing}, + author = {Kawaguchi, Kenji and Kaelbling, Leslie Pack and Bengio, Yoshua}, + file = {/home/dimitri/Nextcloud/Zotero/storage/IGRS7AC2/Kawaguchi et al. - 2017 - Generalization in Deep Learning.pdf;/home/dimitri/Nextcloud/Zotero/storage/TXV7FXKZ/1710.html} +} + +@article{vayerOptimalTransportStructured2018, + archivePrefix = {arXiv}, + eprinttype = {arxiv}, + eprint = {1805.09114}, + primaryClass = {cs, stat}, + title = {Optimal {{Transport}} for Structured Data with Application on Graphs}, + url = {http://arxiv.org/abs/1805.09114}, + abstract = {This work considers the problem of computing distances between structured objects such as undirected graphs, seen as probability distributions in a specific metric space. We consider a new transportation distance (i.e. that minimizes a total cost of transporting probability masses) that unveils the geometric nature of the structured objects space. Unlike Wasserstein or Gromov-Wasserstein metrics that focus solely and respectively on features (by considering a metric in the feature space) or structure (by seeing structure as a metric space), our new distance exploits jointly both information, and is consequently called Fused Gromov-Wasserstein (FGW). After discussing its properties and computational aspects, we show results on a graph classification task, where our method outperforms both graph kernels and deep graph convolutional networks. 
Exploiting further on the metric properties of FGW, interesting geometric objects such as Fr\textbackslash{}'echet means or barycenters of graphs are illustrated and discussed in a clustering context.}, + urldate = {2019-05-14}, + date = {2018-05-23}, + keywords = {Statistics - Machine Learning,Computer Science - Machine Learning}, + author = {Vayer, Titouan and Chapel, Laetitia and Flamary, Rémi and Tavenard, Romain and Courty, Nicolas}, + file = {/home/dimitri/Nextcloud/Zotero/storage/8KCFRIEK/Vayer et al. - 2018 - Optimal Transport for structured data with applica.pdf;/home/dimitri/Nextcloud/Zotero/storage/HS3SLU6S/1805.html} +} + +@article{frankleLotteryTicketHypothesis2018, + archivePrefix = {arXiv}, + eprinttype = {arxiv}, + eprint = {1803.03635}, + primaryClass = {cs}, + title = {The {{Lottery Ticket Hypothesis}}: {{Finding Sparse}}, {{Trainable Neural Networks}}}, + url = {http://arxiv.org/abs/1803.03635}, + shorttitle = {The {{Lottery Ticket Hypothesis}}}, + abstract = {Neural network pruning techniques can reduce the parameter counts of trained networks by over 90\%, decreasing storage requirements and improving computational performance of inference without compromising accuracy. However, contemporary experience is that the sparse architectures produced by pruning are difficult to train from the start, which would similarly improve training performance. We find that a standard pruning technique naturally uncovers subnetworks whose initializations made them capable of training effectively. Based on these results, we articulate the "lottery ticket hypothesis:" dense, randomly-initialized, feed-forward networks contain subnetworks ("winning tickets") that - when trained in isolation - reach test accuracy comparable to the original network in a similar number of iterations. The winning tickets we find have won the initialization lottery: their connections have initial weights that make training particularly effective. We present an algorithm to identify winning tickets and a series of experiments that support the lottery ticket hypothesis and the importance of these fortuitous initializations. We consistently find winning tickets that are less than 10-20\% of the size of several fully-connected and convolutional feed-forward architectures for MNIST and CIFAR10. Above this size, the winning tickets that we find learn faster than the original network and reach higher test accuracy.}, + urldate = {2019-05-15}, + date = {2018-03-09}, + keywords = {Computer Science - Artificial Intelligence,Computer Science - Machine Learning,Computer Science - Neural and Evolutionary Computing}, + author = {Frankle, Jonathan and Carbin, Michael}, + file = {/home/dimitri/Nextcloud/Zotero/storage/AK4564LE/Frankle and Carbin - 2018 - The Lottery Ticket Hypothesis Finding Sparse, Tra.pdf;/home/dimitri/Nextcloud/Zotero/storage/P7E9CAS7/1803.html} +} + +@article{zhouDeconstructingLotteryTickets2019, + archivePrefix = {arXiv}, + eprinttype = {arxiv}, + eprint = {1905.01067}, + primaryClass = {cs, stat}, + title = {Deconstructing {{Lottery Tickets}}: {{Zeros}}, {{Signs}}, and the {{Supermask}}}, + url = {http://arxiv.org/abs/1905.01067}, + shorttitle = {Deconstructing {{Lottery Tickets}}}, + abstract = {The recent "Lottery Ticket Hypothesis" paper by Frankle \& Carbin showed that a simple approach to creating sparse networks (keep the large weights) results in models that are trainable from scratch, but only when starting from the same initial weights. 
The performance of these networks often exceeds the performance of the non-sparse base model, but for reasons that were not well understood. In this paper we study the three critical components of the Lottery Ticket (LT) algorithm, showing that each may be varied significantly without impacting the overall results. Ablating these factors leads to new insights for why LT networks perform as well as they do. We show why setting weights to zero is important, how signs are all you need to make the re-initialized network train, and why masking behaves like training. Finally, we discover the existence of Supermasks, or masks that can be applied to an untrained, randomly initialized network to produce a model with performance far better than chance (86\% on MNIST, 41\% on CIFAR-10).}, + urldate = {2019-05-15}, + date = {2019-05-03}, + keywords = {Computer Science - Computer Vision and Pattern Recognition,Statistics - Machine Learning,Computer Science - Machine Learning}, + author = {Zhou, Hattie and Lan, Janice and Liu, Rosanne and Yosinski, Jason}, + file = {/home/dimitri/Nextcloud/Zotero/storage/8FFPAI2U/Zhou et al. - 2019 - Deconstructing Lottery Tickets Zeros, Signs, and .pdf;/home/dimitri/Nextcloud/Zotero/storage/RFK86THM/1905.html} +} + +@article{hesterDeepQlearningDemonstrations2017, + archivePrefix = {arXiv}, + eprinttype = {arxiv}, + eprint = {1704.03732}, + primaryClass = {cs}, + title = {Deep {{Q}}-Learning from {{Demonstrations}}}, + url = {http://arxiv.org/abs/1704.03732}, + abstract = {Deep reinforcement learning (RL) has achieved several high profile successes in difficult decision-making problems. However, these algorithms typically require a huge amount of data before they reach reasonable performance. In fact, their performance during learning can be extremely poor. This may be acceptable for a simulator, but it severely limits the applicability of deep RL to many real-world tasks, where the agent must learn in the real environment. In this paper we study a setting where the agent may access data from previous control of the system. We present an algorithm, Deep Q-learning from Demonstrations (DQfD), that leverages small sets of demonstration data to massively accelerate the learning process even from relatively small amounts of demonstration data and is able to automatically assess the necessary ratio of demonstration data while learning thanks to a prioritized replay mechanism. DQfD works by combining temporal difference updates with supervised classification of the demonstrator's actions. We show that DQfD has better initial performance than Prioritized Dueling Double Deep Q-Networks (PDD DQN) as it starts with better scores on the first million steps on 41 of 42 games and on average it takes PDD DQN 83 million steps to catch up to DQfD's performance. DQfD learns to out-perform the best demonstration given in 14 of 42 games. In addition, DQfD leverages human demonstrations to achieve state-of-the-art results for 11 games. Finally, we show that DQfD performs better than three related algorithms for incorporating demonstration data into DQN.}, + urldate = {2019-05-16}, + date = {2017-04-12}, + keywords = {Computer Science - Artificial Intelligence,Computer Science - Machine Learning}, + author = {Hester, Todd and Vecerik, Matej and Pietquin, Olivier and Lanctot, Marc and Schaul, Tom and Piot, Bilal and Horgan, Dan and Quan, John and Sendonaris, Andrew and Dulac-Arnold, Gabriel and Osband, Ian and Agapiou, John and Leibo, Joel Z. 
and Gruslys, Audrunas}, + file = {/home/dimitri/Nextcloud/Zotero/storage/77YC9ZHU/Hester et al. - 2017 - Deep Q-learning from Demonstrations.pdf;/home/dimitri/Nextcloud/Zotero/storage/M7LNGZEI/1704.html} +} + +@article{fanHierarchicalNeuralStory2018, + archivePrefix = {arXiv}, + eprinttype = {arxiv}, + eprint = {1805.04833}, + primaryClass = {cs}, + title = {Hierarchical {{Neural Story Generation}}}, + url = {http://arxiv.org/abs/1805.04833}, + abstract = {We explore story generation: creative systems that can build coherent and fluent passages of text about a topic. We collect a large dataset of 300K human-written stories paired with writing prompts from an online forum. Our dataset enables hierarchical story generation, where the model first generates a premise, and then transforms it into a passage of text. We gain further improvements with a novel form of model fusion that improves the relevance of the story to the prompt, and adding a new gated multi-scale self-attention mechanism to model long-range context. Experiments show large improvements over strong baselines on both automated and human evaluations. Human judges prefer stories generated by our approach to those from a strong non-hierarchical model by a factor of two to one.}, + urldate = {2019-05-16}, + date = {2018-05-13}, + keywords = {Computer Science - Computation and Language}, + author = {Fan, Angela and Lewis, Mike and Dauphin, Yann}, + file = {/home/dimitri/Nextcloud/Zotero/storage/3DCR53RA/Fan et al. - 2018 - Hierarchical Neural Story Generation.pdf;/home/dimitri/Nextcloud/Zotero/storage/MKM4L4SD/1805.html} +} + +@article{mnihHumanlevelControlDeep2015, + langid = {english}, + title = {Human-Level Control through Deep Reinforcement Learning}, + volume = {518}, + issn = {0028-0836, 1476-4687}, + url = {http://www.nature.com/articles/nature14236}, + doi = {10.1038/nature14236}, + number = {7540}, + journaltitle = {Nature}, + shortjournal = {Nature}, + urldate = {2019-05-17}, + date = {2015-02}, + pages = {529-533}, + author = {Mnih, Volodymyr and Kavukcuoglu, Koray and Silver, David and Rusu, Andrei A. and Veness, Joel and Bellemare, Marc G. and Graves, Alex and Riedmiller, Martin and Fidjeland, Andreas K. and Ostrovski, Georg and Petersen, Stig and Beattie, Charles and Sadik, Amir and Antonoglou, Ioannis and King, Helen and Kumaran, Dharshan and Wierstra, Daan and Legg, Shane and Hassabis, Demis}, + file = {/home/dimitri/Nextcloud/Zotero/storage/PN6LQZVH/Mnih et al. - 2015 - Human-level control through deep reinforcement lea.pdf} +} + +@article{silverMasteringGameGo2016, + langid = {english}, + title = {Mastering the Game of {{Go}} with Deep Neural Networks and Tree Search}, + volume = {529}, + issn = {0028-0836, 1476-4687}, + url = {http://www.nature.com/articles/nature16961}, + doi = {10.1038/nature16961}, + number = {7587}, + journaltitle = {Nature}, + shortjournal = {Nature}, + urldate = {2019-05-17}, + date = {2016-01}, + pages = {484-489}, + author = {Silver, David and Huang, Aja and Maddison, Chris J. and Guez, Arthur and Sifre, Laurent and family=Driessche, given=George, prefix=van den, useprefix=true and Schrittwieser, Julian and Antonoglou, Ioannis and Panneershelvam, Veda and Lanctot, Marc and Dieleman, Sander and Grewe, Dominik and Nham, John and Kalchbrenner, Nal and Sutskever, Ilya and Lillicrap, Timothy and Leach, Madeleine and Kavukcuoglu, Koray and Graepel, Thore and Hassabis, Demis}, + file = {/home/dimitri/Nextcloud/Zotero/storage/EE74MJVF/Silver et al. 
- 2016 - Mastering the game of Go with deep neural networks.pdf} +} + +@incollection{cohen-addadHierarchicalClusteringObjective2018, + title = {Hierarchical {{Clustering}}: {{Objective Functions}} and {{Algorithms}}}, + url = {https://epubs.siam.org/doi/10.1137/1.9781611975031.26}, + shorttitle = {Hierarchical {{Clustering}}}, + abstract = {Hierarchical clustering is a recursive partitioning of a dataset into clusters at an increasingly finer granularity. Motivated by the fact that most work on hierarchical clustering was based on providing algorithms, rather than optimizing a specific objective, [19] framed similarity-based hierarchical clustering as a combinatorial optimization problem, where a ‘good’ hierarchical clustering is one that minimizes some cost function. He showed that this cost function has certain desirable properties, such as in order to achieve optimal cost, disconnected components must be separated first and that in ‘structureless’ graphs, i.e., cliques, all clusterings achieve the same cost. We take an axiomatic approach to defining ‘good’ objective functions for both similarity and dissimilarity-based hierarchical clustering. We characterize a set of admissible objective functions (that includes the one introduced by Dasgupta) that have the property that when the input admits a ‘natural’ ground-truth hierarchical clustering, the ground-truth clustering has an optimal value. Equipped with a suitable objective function, we analyze the performance of practical algorithms, as well as develop better and faster algorithms for hierarchical clustering. For similarity-based hierarchical clustering, [19] showed that a simple recursive sparsest-cut based approach achieves an O(log3/2 n)-approximation on worst-case inputs. We give a more refined analysis of the algorithm and show that it in fact achieves an -approximation1. This improves upon the LP-based O(log n)-approximation of [33]. For dissimilarity-based hierarchical clustering, we show that the classic average-linkage algorithm gives a factor 2 approximation, and provide a simple and better algorithm that gives a factor 3/2 approximation. This aims at explaining the success of these heuristics in practice. Finally, we consider a ‘beyond-worst-case’ scenario through a generalisation of the stochastic block model for hierarchical clustering. We show that Dasgupta's cost function also has desirable properties for these inputs and we provide a simple algorithm that for graphs generated according to this model yields a 1 + o(1) factor approximation.}, + volumes = {0}, + booktitle = {Proceedings of the {{Twenty}}-{{Ninth Annual ACM}}-{{SIAM Symposium}} on {{Discrete Algorithms}}}, + series = {Proceedings}, + publisher = {{Society for Industrial and Applied Mathematics}}, + urldate = {2019-05-17}, + date = {2018-01-01}, + pages = {378-397}, + author = {Cohen-Addad, V. and Kanade, V. and Mallmann-Trenn, F. and Mathieu, C.}, + file = {/home/dimitri/Nextcloud/Zotero/storage/VHDL9VUB/Cohen-Addad et al. 
- 2018 - Hierarchical Clustering Objective Functions and A.pdf;/home/dimitri/Nextcloud/Zotero/storage/EJTTUVNP/1.9781611975031.html}, + doi = {10.1137/1.9781611975031.26} +} + +@inproceedings{thomasHighConfidenceOffpolicy2015, + title = {High {{Confidence Off}}-Policy {{Evaluation}}}, + isbn = {978-0-262-51129-2}, + url = {https://www.aaai.org/ocs/index.php/AAAI/AAAI15/paper/viewPaper/10042}, + abstract = {Many reinforcement learning algorithms use trajectories collected from the execution of one or more policies to propose a new policy. Because execution of a bad policy can be costly or dangerous, techniques for evaluating the performance of the new policy without requiring its execution have been of recent interest in industry. Such off-policy evaluation methods, which estimate the performance of a policy using trajectories collected from the execution of other policies, heretofore have not provided confidences regarding the accuracy of their estimates. In this paper we propose an off-policy method for computing a lower confidence bound on the expected return of a policy.}, + booktitle = {Proceedings of the {{Twenty}}-{{Ninth AAAI Conference}} on {{Artificial Intelligence}}}, + series = {{{AAAI}}'15}, + publisher = {{AAAI Press}}, + urldate = {2019-05-17}, + date = {2015}, + pages = {3000--3006}, + author = {Thomas, Philip S. and Theocharous, Georgios and Ghavamzadeh, Mohammad}, + file = {/home/dimitri/Nextcloud/Zotero/storage/DSD86HE9/Thomas et al. - 2015 - High Confidence Off-policy Evaluation.pdf}, + venue = {Austin, Texas} +} + +@inproceedings{thomasDataEfficientOffPolicyPolicy2016, + langid = {english}, + title = {Data-{{Efficient Off}}-{{Policy Policy Evaluation}} for {{Reinforcement Learning}}}, + url = {http://proceedings.mlr.press/v48/thomasa16.html}, + abstract = {In this paper we present a new way of predicting the performance of a reinforcement learning policy given historical data that may have been generated by a different policy. The ability to evaluate...}, + eventtitle = {International {{Conference}} on {{Machine Learning}}}, + booktitle = {International {{Conference}} on {{Machine Learning}}}, + urldate = {2019-05-17}, + date = {2016-06-11}, + pages = {2139-2148}, + author = {Thomas, Philip and Brunskill, Emma}, + file = {/home/dimitri/Nextcloud/Zotero/storage/KRD885VU/Thomas and Brunskill - 2016 - Data-Efficient Off-Policy Policy Evaluation for Re.pdf;/home/dimitri/Nextcloud/Zotero/storage/X2R64R83/Appendix.pdf;/home/dimitri/Nextcloud/Zotero/storage/YPJPLZJ6/thomasa16.html} +} + +@inproceedings{farajtabarMoreRobustDoubly2018, + langid = {english}, + title = {More {{Robust Doubly Robust Off}}-Policy {{Evaluation}}}, + url = {http://proceedings.mlr.press/v80/farajtabar18a.html}, + abstract = {We study the problem of off-policy evaluation (OPE) in reinforcement learning (RL), where the goal is to estimate the performance of a policy from the data generated by another policy(ies). In part...}, + eventtitle = {International {{Conference}} on {{Machine Learning}}}, + booktitle = {International {{Conference}} on {{Machine Learning}}}, + urldate = {2019-05-17}, + date = {2018-07-03}, + pages = {1447-1456}, + author = {Farajtabar, Mehrdad and Chow, Yinlam and Ghavamzadeh, Mohammad}, + file = {/home/dimitri/Nextcloud/Zotero/storage/NJK9Q7YI/Farajtabar et al. 
- 2018 - More Robust Doubly Robust Off-policy Evaluation.pdf;/home/dimitri/Nextcloud/Zotero/storage/XYSG7MCZ/farajtabar18a.html} +} + +@inproceedings{jiangDoublyRobustOffpolicy2016, + langid = {english}, + title = {Doubly {{Robust Off}}-Policy {{Value Evaluation}} for {{Reinforcement Learning}}}, + url = {http://proceedings.mlr.press/v48/jiang16.html}, + abstract = {We study the problem of off-policy value evaluation in reinforcement learning (RL), where one aims to estimate the value of a new policy based on data collected by a different policy. This problem ...}, + eventtitle = {International {{Conference}} on {{Machine Learning}}}, + booktitle = {International {{Conference}} on {{Machine Learning}}}, + urldate = {2019-05-17}, + date = {2016-06-11}, + pages = {652-661}, + author = {Jiang, Nan and Li, Lihong}, + file = {/home/dimitri/Nextcloud/Zotero/storage/6B6SSYEQ/jiang16-supp.pdf;/home/dimitri/Nextcloud/Zotero/storage/IBUZU3LH/Jiang and Li - 2016 - Doubly Robust Off-policy Value Evaluation for Rein.pdf;/home/dimitri/Nextcloud/Zotero/storage/XRBVCV63/jiang16.html} +} + +@article{mazarDishonestyEverydayLife2006, + langid = {english}, + title = {Dishonesty in {{Everyday Life}} and {{Its Policy Implications}}}, + volume = {25}, + issn = {0748-6766, 1547-7207}, + url = {http://journals.sagepub.com/doi/10.1509/jppm.25.1.117}, + doi = {10.1509/jppm.25.1.117}, + number = {1}, + journaltitle = {Journal of Public Policy \& Marketing}, + shortjournal = {Journal of Public Policy \& Marketing}, + urldate = {2019-05-17}, + date = {2006-04}, + pages = {117-126}, + author = {Mazar, Nina and Ariely, Dan}, + file = {/home/dimitri/Nextcloud/Zotero/storage/6NM8HENU/Mazar_Ariely_2006_Dishonesty in Everyday Life and Its Policy Implications.pdf} +} + +@article{nagelWhatItBe1974, + title = {What {{Is It Like}} to {{Be}} a {{Bat}}?}, + volume = {83}, + issn = {00318108}, + url = {https://www.jstor.org/stable/2183914?origin=crossref}, + doi = {10.2307/2183914}, + number = {4}, + journaltitle = {The Philosophical Review}, + shortjournal = {The Philosophical Review}, + urldate = {2019-05-17}, + date = {1974-10}, + pages = {435}, + author = {Nagel, Thomas}, + file = {/home/dimitri/Nextcloud/Zotero/storage/93ZKZZFZ/Nagel_1974_What Is It Like to Be a Bat.pdf} +} + +@article{kaufmannImaginationArtificielle1969, + langid = {french}, + title = {L'imagination artificielle}, + volume = {3}, + issn = {0399-0559}, + url = {https://eudml.org/doc/104480}, + number = {V3}, + journaltitle = {RAIRO - Operations Research - Recherche Opérationnelle}, + urldate = {2019-05-20}, + date = {1969}, + pages = {5-24}, + author = {Kaufmann, A.}, + file = {/home/dimitri/Nextcloud/Zotero/storage/GG5TMAJJ/Kaufmann - 1969 - L'imagination artificielle.pdf;/home/dimitri/Nextcloud/Zotero/storage/INXU95SE/104480.html} +} + +@article{swingerWhatAreBiases2018, + archivePrefix = {arXiv}, + eprinttype = {arxiv}, + eprint = {1812.08769}, + primaryClass = {cs}, + title = {What Are the Biases in My Word Embedding?}, + url = {http://arxiv.org/abs/1812.08769}, + abstract = {This paper presents an algorithm for enumerating biases in word embeddings. The algorithm exposes a large number of offensive associations related to sensitive features such as race and gender on publicly available embeddings, including a supposedly "debiased" embedding. These biases are concerning in light of the widespread use of word embeddings. 
The associations are identified by geometric patterns in word embeddings that run parallel between people's names and common lower-case tokens. The algorithm is highly unsupervised: it does not even require the sensitive features to be pre-specified. This is desirable because: (a) many forms of discrimination--such as racial discrimination--are linked to social constructs that may vary depending on the context, rather than to categories with fixed definitions; and (b) it makes it easier to identify biases against intersectional groups, which depend on combinations of sensitive features. The inputs to our algorithm are a list of target tokens, e.g. names, and a word embedding. It outputs a number of Word Embedding Association Tests (WEATs) that capture various biases present in the data. We illustrate the utility of our approach on publicly available word embeddings and lists of names, and evaluate its output using crowdsourcing. We also show how removing names may not remove potential proxy bias.}, + urldate = {2019-05-21}, + date = {2018-12-20}, + keywords = {Computer Science - Computation and Language,Computer Science - Machine Learning}, + author = {Swinger, Nathaniel and De-Arteaga, Maria and Heffernan IV, Neil Thomas and Leiserson, Mark DM and Kalai, Adam Tauman}, + file = {/home/dimitri/Nextcloud/Zotero/storage/48KWSJ5B/Swinger et al. - 2018 - What are the biases in my word embedding.pdf;/home/dimitri/Nextcloud/Zotero/storage/GLJK7K6P/1812.html} +} + +@article{goldtStochasticThermodynamicsLearning2017, + langid = {english}, + title = {Stochastic {{Thermodynamics}} of {{Learning}}}, + volume = {118}, + issn = {0031-9007, 1079-7114}, + url = {https://link.aps.org/doi/10.1103/PhysRevLett.118.010601}, + doi = {10.1103/PhysRevLett.118.010601}, + number = {1}, + journaltitle = {Physical Review Letters}, + shortjournal = {Phys. Rev. Lett.}, + urldate = {2019-05-22}, + date = {2017-01-06}, + pages = {010601}, + author = {Goldt, Sebastian and Seifert, Udo}, + file = {/home/dimitri/Nextcloud/Zotero/storage/VZIXU9EB/goldt2017.pdf;/home/dimitri/Nextcloud/Zotero/storage/ZS4FRRGR/Goldt and Seifert - 2017 - Stochastic Thermodynamics of Learning.pdf} +} + +@article{alemiTherMLThermodynamicsMachine2018, + archivePrefix = {arXiv}, + eprinttype = {arxiv}, + eprint = {1807.04162}, + primaryClass = {cond-mat, stat}, + title = {{{TherML}}: {{Thermodynamics}} of {{Machine Learning}}}, + url = {http://arxiv.org/abs/1807.04162}, + shorttitle = {{{TherML}}}, + abstract = {In this work we offer a framework for reasoning about a wide class of existing objectives in machine learning. We develop a formal correspondence between this work and thermodynamics and discuss its implications.}, + urldate = {2019-05-22}, + date = {2018-07-11}, + keywords = {Statistics - Machine Learning,Condensed Matter - Statistical Mechanics,Computer Science - Machine Learning}, + author = {Alemi, Alexander A. 
and Fischer, Ian}, + file = {/home/dimitri/Nextcloud/Zotero/storage/XQJKMW9R/Alemi and Fischer - 2018 - TherML Thermodynamics of Machine Learning.pdf;/home/dimitri/Nextcloud/Zotero/storage/4B76X7GC/1807.html} +} + +@article{lloydComplexityThermodynamicDepth1988, + langid = {english}, + title = {Complexity as Thermodynamic Depth}, + volume = {188}, + issn = {00034916}, + url = {https://linkinghub.elsevier.com/retrieve/pii/0003491688900942}, + doi = {10.1016/0003-4916(88)90094-2}, + number = {1}, + journaltitle = {Annals of Physics}, + shortjournal = {Annals of Physics}, + urldate = {2019-05-22}, + date = {1988-11}, + pages = {186-213}, + author = {Lloyd, Seth and Pagels, Heinz}, + file = {/home/dimitri/Nextcloud/Zotero/storage/QRIK24NG/Lloyd and Pagels - 1988 - Complexity as thermodynamic depth.pdf} +} + +@article{graysonIntroductionUnivalentFoundations2018, + langid = {english}, + title = {An Introduction to Univalent Foundations for Mathematicians}, + volume = {55}, + issn = {0273-0979, 1088-9485}, + url = {http://www.ams.org/bull/2018-55-04/S0273-0979-2018-01616-9/}, + doi = {10.1090/bull/1616}, + number = {4}, + journaltitle = {Bulletin of the American Mathematical Society}, + shortjournal = {Bull. Amer. Math. Soc.}, + urldate = {2019-05-23}, + date = {2018-03-05}, + pages = {427-450}, + author = {Grayson, Daniel R.}, + file = {/home/dimitri/Nextcloud/Zotero/storage/T6445HRJ/Grayson - 2018 - An introduction to univalent foundations for mathe.pdf} +} + +@article{wietingNoTrainingRequired2019a, + archivePrefix = {arXiv}, + eprinttype = {arxiv}, + eprint = {1901.10444}, + primaryClass = {cs}, + title = {No {{Training Required}}: {{Exploring Random Encoders}} for {{Sentence Classification}}}, + url = {http://arxiv.org/abs/1901.10444}, + shorttitle = {No {{Training Required}}}, + abstract = {We explore various methods for computing sentence representations from pre-trained word embeddings without any training, i.e., using nothing but random parameterizations. Our aim is to put sentence embeddings on more solid footing by 1) looking at how much modern sentence embeddings gain over random methods---as it turns out, surprisingly little; and by 2) providing the field with more appropriate baselines going forward---which are, as it turns out, quite strong. We also make important observations about proper experimental protocol for sentence classification evaluation, together with recommendations for future research.}, + urldate = {2019-05-24}, + date = {2019-01-29}, + keywords = {Computer Science - Computation and Language}, + author = {Wieting, John and Kiela, Douwe}, + file = {/home/dimitri/Nextcloud/Zotero/storage/PSEYU45U/Wieting and Kiela - 2019 - No Training Required Exploring Random Encoders fo.pdf;/home/dimitri/Nextcloud/Zotero/storage/2CP6PMIW/1901.html} +} + +@article{saxeMathematicalTheorySemantic2019, + langid = {english}, + title = {A Mathematical Theory of Semantic Development in Deep Neural Networks}, + issn = {0027-8424, 1091-6490}, + url = {http://www.pnas.org/lookup/doi/10.1073/pnas.1820226116}, + doi = {10.1073/pnas.1820226116}, + abstract = {An extensive body of empirical research has revealed remarkable regularities in the acquisition, organization, deployment, and neural representation of human semantic knowledge, thereby raising a fundamental conceptual question: What are the theoretical principles governing the ability of neural networks to acquire, organize, and deploy abstract knowledge by integrating across many individual experiences? 
We address this question by mathematically analyzing the nonlinear dynamics of learning in deep linear networks. We find exact solutions to this learning dynamics that yield a conceptual explanation for the prevalence of many disparate phenomena in semantic cognition, including the hierarchical differentiation of concepts through rapid developmental transitions, the ubiquity of semantic illusions between such transitions, the emergence of item typicality and category coherence as factors controlling the speed of semantic processing, changing patterns of inductive projection over development, and the conservation of semantic similarity in neural representations across species. Thus, surprisingly, our simple neural model qualitatively recapitulates many diverse regularities underlying semantic development, while providing analytic insight into how the statistical structure of an environment can interact with nonlinear deep-learning dynamics to give rise to these regularities.}, + journaltitle = {Proceedings of the National Academy of Sciences}, + shortjournal = {Proc Natl Acad Sci USA}, + urldate = {2019-05-24}, + date = {2019-05-17}, + pages = {201820226}, + author = {Saxe, Andrew M. and McClelland, James L. and Ganguli, Surya}, + file = {/home/dimitri/Nextcloud/Zotero/storage/QU842KYR/Saxe et al. - 2019 - A mathematical theory of semantic development in d.pdf} +} + +@article{ben-davidLearnabilityCanBe2019, + langid = {english}, + title = {Learnability Can Be Undecidable}, + volume = {1}, + issn = {2522-5839}, + url = {https://www.nature.com/articles/s42256-018-0002-3}, + doi = {10.1038/s42256-018-0002-3}, + abstract = {Not all mathematical questions can be resolved, according to Gödel’s famous incompleteness theorems. It turns out that machine learning can be vulnerable to undecidability too, as is illustrated with an example problem where learnability cannot be proved nor refuted.}, + number = {1}, + journaltitle = {Nature Machine Intelligence}, + urldate = {2019-05-24}, + date = {2019-01}, + pages = {44}, + author = {Ben-David, Shai and Hrubeš, Pavel and Moran, Shay and Shpilka, Amir and Yehudayoff, Amir}, + file = {/home/dimitri/Nextcloud/Zotero/storage/VD6BTBEG/Ben-David et al. - 2019 - Learnability can be undecidable.pdf;/home/dimitri/Nextcloud/Zotero/storage/K9RU5GU4/s42256-018-0002-3.html} +} + +@book{theunivalentfoundationsprogramHomotopyTypeTheory2013, + location = {{Institute for Advanced Study}}, + title = {Homotopy {{Type Theory}}: {{Univalent Foundations}} of {{Mathematics}}}, + url = {https://homotopytypetheory.org/book/}, + shorttitle = {Homotopy {{Type Theory}}}, + date = {2013}, + author = {The Univalent Foundations Program}, + note = {Open Library ID: OL25428110M} +} + +@article{vayerSlicedGromovWasserstein2019, + archivePrefix = {arXiv}, + eprinttype = {arxiv}, + eprint = {1905.10124}, + primaryClass = {cs, stat}, + title = {Sliced {{Gromov}}-{{Wasserstein}}}, + url = {http://arxiv.org/abs/1905.10124}, + abstract = {Recently used in various machine learning contexts, the Gromov-Wasserstein distance (GW) allows for comparing distributions that do not necessarily lie in the same metric space. However, this Optimal Transport (OT) distance requires solving a complex non convex quadratic program which is most of the time very costly both in time and memory. Contrary to GW, the Wasserstein distance (W) enjoys several properties (e.g. duality) that permit large scale optimization. 
Among those, the Sliced Wasserstein (SW) distance exploits the direct solution of W on the line, that only requires sorting discrete samples in 1D. This paper propose a new divergence based on GW akin to SW. We first derive a closed form for GW when dealing with 1D distributions, based on a new result for the related quadratic assignment problem. We then define a novel OT discrepancy that can deal with large scale distributions via a slicing approach and we show how it relates to the GW distance while being \$O(n\^2)\$ to compute. We illustrate the behavior of this so called Sliced Gromov-Wasserstein (SGW) discrepancy in experiments where we demonstrate its ability to tackle similar problems as GW while being several order of magnitudes faster to compute}, + urldate = {2019-05-29}, + date = {2019-05-24}, + keywords = {Statistics - Machine Learning,Computer Science - Machine Learning}, + author = {Vayer, Titouan and Flamary, Rémi and Tavenard, Romain and Chapel, Laetitia and Courty, Nicolas}, + file = {/home/dimitri/Nextcloud/Zotero/storage/8ZGBPMIP/Vayer et al. - 2019 - Sliced Gromov-Wasserstein.pdf;/home/dimitri/Nextcloud/Zotero/storage/EIJKFLT5/1905.html} +} + +@article{zhangERNIEEnhancedLanguage2019, + archivePrefix = {arXiv}, + eprinttype = {arxiv}, + eprint = {1905.07129}, + primaryClass = {cs}, + title = {{{ERNIE}}: {{Enhanced Language Representation}} with {{Informative Entities}}}, + url = {http://arxiv.org/abs/1905.07129}, + shorttitle = {{{ERNIE}}}, + abstract = {Neural language representation models such as BERT pre-trained on large-scale corpora can well capture rich semantic patterns from plain text, and be fine-tuned to consistently improve the performance of various NLP tasks. However, the existing pre-trained language models rarely consider incorporating knowledge graphs (KGs), which can provide rich structured knowledge facts for better language understanding. We argue that informative entities in KGs can enhance language representation with external knowledge. In this paper, we utilize both large-scale textual corpora and KGs to train an enhanced language representation model (ERNIE), which can take full advantage of lexical, syntactic, and knowledge information simultaneously. The experimental results have demonstrated that ERNIE achieves significant improvements on various knowledge-driven tasks, and meanwhile is comparable with the state-of-the-art model BERT on other common NLP tasks. The source code of this paper can be obtained from https://github.com/thunlp/ERNIE.}, + urldate = {2019-05-29}, + date = {2019-05-17}, + keywords = {Computer Science - Computation and Language}, + author = {Zhang, Zhengyan and Han, Xu and Liu, Zhiyuan and Jiang, Xin and Sun, Maosong and Liu, Qun}, + file = {/home/dimitri/Nextcloud/Zotero/storage/9IB4FYL3/Zhang et al. - 2019 - ERNIE Enhanced Language Representation with Infor.pdf;/home/dimitri/Nextcloud/Zotero/storage/5UXV3APF/1905.html} +} + +@book{ambrosioGradientFlowsMetric2008, + langid = {english}, + location = {{Basel}}, + title = {Gradient Flows in Metric Spaces and in the Space of Probability Measures}, + edition = {2. 
ed}, + isbn = {978-3-7643-8722-8 978-3-7643-8721-1}, + pagetotal = {334}, + series = {Lectures in Mathematics {{ETH Zürich}}}, + publisher = {{Birkhäuser}}, + date = {2008}, + author = {Ambrosio, Luigi and Gigli, Nicola and Savaré, Giuseppe}, + file = {/home/dimitri/Nextcloud/Zotero/storage/AJXKRTAG/gradient-flows-2008.pdf}, + note = {OCLC: 254181287} +} + +@article{dyerRecurrentNeuralNetwork2016, + archivePrefix = {arXiv}, + eprinttype = {arxiv}, + eprint = {1602.07776}, + primaryClass = {cs}, + title = {Recurrent {{Neural Network Grammars}}}, + url = {http://arxiv.org/abs/1602.07776}, + abstract = {We introduce recurrent neural network grammars, probabilistic models of sentences with explicit phrase structure. We explain efficient inference procedures that allow application to both parsing and language modeling. Experiments show that they provide better parsing in English than any single previously published supervised generative model and better language modeling than state-of-the-art sequential RNNs in English and Chinese.}, + urldate = {2019-06-03}, + date = {2016-02-24}, + keywords = {Computer Science - Computation and Language,Computer Science - Neural and Evolutionary Computing}, + author = {Dyer, Chris and Kuncoro, Adhiguna and Ballesteros, Miguel and Smith, Noah A.}, + file = {/home/dimitri/Nextcloud/Zotero/storage/WQVMS2ZL/Dyer et al. - 2016 - Recurrent Neural Network Grammars.pdf;/home/dimitri/Nextcloud/Zotero/storage/G457GBIL/1602.html} +} + +@article{bruel-gabrielssonTopologyLayerMachine2019, + archivePrefix = {arXiv}, + eprinttype = {arxiv}, + eprint = {1905.12200}, + primaryClass = {cs, stat}, + title = {A {{Topology Layer}} for {{Machine Learning}}}, + url = {http://arxiv.org/abs/1905.12200}, + abstract = {Topology applied to real world data using persistent homology has started to find applications within machine learning, including deep learning. We present a differentiable topology layer that computes persistent homology based on level set filtrations and distance-bases filtrations. We present three novel applications: the topological layer can (i) serve as a regularizer directly on data or the weights of machine learning models, (ii) construct a loss on the output of a deep generative network to incorporate topological priors, and (iii) perform topological adversarial attacks on deep networks trained with persistence features. The code is publicly available and we hope its availability will facilitate the use of persistent homology in deep learning and other gradient based applications.}, + urldate = {2019-06-03}, + date = {2019-05-28}, + keywords = {Statistics - Machine Learning,Computer Science - Machine Learning}, + author = {Brüel-Gabrielsson, Rickard and Nelson, Bradley J. and Dwaraknath, Anjan and Skraba, Primoz and Guibas, Leonidas J. and Carlsson, Gunnar}, + file = {/home/dimitri/Nextcloud/Zotero/storage/M2QTLMQG/Brüel-Gabrielsson et al. - 2019 - A Topology Layer for Machine Learning.pdf;/home/dimitri/Nextcloud/Zotero/storage/S6DIBJTQ/1905.html} +} + +@article{tangTargetGuidedOpenDomainConversation2019, + archivePrefix = {arXiv}, + eprinttype = {arxiv}, + eprint = {1905.11553}, + primaryClass = {cs}, + title = {Target-{{Guided Open}}-{{Domain Conversation}}}, + url = {http://arxiv.org/abs/1905.11553}, + abstract = {Many real-world open-domain conversation applications have specific goals to achieve during open-ended chats, such as recommendation, psychotherapy, education, etc. We study the problem of imposing conversational goals on open-domain chat agents. 
In particular, we want a conversational system to chat naturally with human and proactively guide the conversation to a designated target subject. The problem is challenging as no public data is available for learning such a target-guided strategy. We propose a structured approach that introduces coarse-grained keywords to control the intended content of system responses. We then attain smooth conversation transition through turn-level supervised learning, and drive the conversation towards the target with discourse-level constraints. We further derive a keyword-augmented conversation dataset for the study. Quantitative and human evaluations show our system can produce meaningful and effective conversations, significantly improving over other approaches.}, + urldate = {2019-06-03}, + date = {2019-05-27}, + keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language,Computer Science - Machine Learning}, + author = {Tang, Jianheng and Zhao, Tiancheng and Xiong, Chenyan and Liang, Xiaodan and Xing, Eric P. and Hu, Zhiting}, + file = {/home/dimitri/Nextcloud/Zotero/storage/IL2RB6MX/Tang et al. - 2019 - Target-Guided Open-Domain Conversation.pdf;/home/dimitri/Nextcloud/Zotero/storage/LUZRZYFE/1905.html} +} + +@article{jaderbergHumanlevelPerformance3D2019, + langid = {english}, + title = {Human-Level Performance in {{3D}} Multiplayer Games with Population-Based Reinforcement Learning}, + volume = {364}, + issn = {0036-8075, 1095-9203}, + url = {https://science.sciencemag.org/content/364/6443/859}, + doi = {10.1126/science.aau6249}, + abstract = {Artificial teamwork +Artificially intelligent agents are getting better and better at two-player games, but most real-world endeavors require teamwork. Jaderberg et al. designed a computer program that excels at playing the video game Quake III Arena in Capture the Flag mode, where two multiplayer teams compete in capturing the flags of the opposing team. The agents were trained by playing thousands of games, gradually learning successful strategies not unlike those favored by their human counterparts. Computer agents competed successfully against humans even when their reaction times were slowed to match those of humans. +Science, this issue p. 859 +Reinforcement learning (RL) has shown great success in increasingly complex single-agent environments and two-player turn-based games. However, the real world contains multiple agents, each learning and acting independently to cooperate and compete with other agents. We used a tournament-style evaluation to demonstrate that an agent can achieve human-level performance in a three-dimensional multiplayer first-person video game, Quake III Arena in Capture the Flag mode, using only pixels and game points scored as input. We used a two-tier optimization process in which a population of independent RL agents are trained concurrently from thousands of parallel matches on randomly generated environments. Each agent learns its own internal reward signal and rich representation of the world. These results indicate the great potential of multiagent reinforcement learning for artificial intelligence research. +Teams of artificial agents compete successfully against humans in the video game Quake III Arena in Capture the Flag mode. 
+Teams of artificial agents compete successfully against humans in the video game Quake III Arena in Capture the Flag mode.}, + number = {6443}, + journaltitle = {Science}, + urldate = {2019-06-03}, + date = {2019-05-31}, + pages = {859-865}, + author = {Jaderberg, Max and Czarnecki, Wojciech M. and Dunning, Iain and Marris, Luke and Lever, Guy and Castañeda, Antonio Garcia and Beattie, Charles and Rabinowitz, Neil C. and Morcos, Ari S. and Ruderman, Avraham and Sonnerat, Nicolas and Green, Tim and Deason, Louise and Leibo, Joel Z. and Silver, David and Hassabis, Demis and Kavukcuoglu, Koray and Graepel, Thore}, + file = {/home/dimitri/Nextcloud/Zotero/storage/BKW8SC9N/Jaderberg et al. - 2019 - Human-level performance in 3D multiplayer games wi.pdf;/home/dimitri/Nextcloud/Zotero/storage/PHLALIVP/859.html}, + eprinttype = {pmid}, + eprint = {31147514} +} + +@article{leinsterRethinkingSetTheory2012, + archivePrefix = {arXiv}, + eprinttype = {arxiv}, + eprint = {1212.6543}, + primaryClass = {math}, + title = {Rethinking Set Theory}, + url = {http://arxiv.org/abs/1212.6543}, + abstract = {Mathematicians manipulate sets with confidence almost every day, rarely making mistakes. Few of us, however, could accurately quote what are often referred to as "the" axioms of set theory. This suggests that we all carry around with us, perhaps subconsciously, a reliable body of operating principles for manipulating sets. What if we were to take some of those principles and adopt them as our axioms instead? The message of this article is that this can be done, in a simple, practical way (due to Lawvere). The resulting axioms are ten thoroughly mundane statements about sets. This is an expository article for a general mathematical readership.}, + urldate = {2019-06-04}, + date = {2012-12-28}, + keywords = {Mathematics - Category Theory,Mathematics - Logic}, + author = {Leinster, Tom}, + file = {/home/dimitri/Nextcloud/Zotero/storage/9AIKI668/Leinster - 2012 - Rethinking set theory.pdf;/home/dimitri/Nextcloud/Zotero/storage/SBVQGL23/1212.html} +} + +@article{carraraBudgetedReinforcementLearning2019, + archivePrefix = {arXiv}, + eprinttype = {arxiv}, + eprint = {1903.01004}, + primaryClass = {cs, stat}, + title = {Budgeted {{Reinforcement Learning}} in {{Continuous State Space}}}, + url = {http://arxiv.org/abs/1903.01004}, + abstract = {A Budgeted Markov Decision Process (BMDP) is an extension of a Markov Decision Process to critical applications requiring safety constraints. It relies on a notion of risk implemented in the shape of a cost signal constrained to lie below an - adjustable - threshold. So far, BMDPs could only be solved in the case of finite state spaces with known dynamics. This work extends the state-of-the-art to continuous spaces environments and unknown dynamics. We show that the solution to a BMDP is a fixed point of a novel Budgeted Bellman Optimality operator. This observation allows us to introduce natural extensions of Deep Reinforcement Learning algorithms to address large-scale BMDPs. We validate our approach on two simulated applications: spoken dialogue and autonomous driving.}, + urldate = {2019-06-05}, + date = {2019-03-03}, + keywords = {Statistics - Machine Learning,Computer Science - Artificial Intelligence,Computer Science - Machine Learning}, + author = {Carrara, Nicolas and Leurent, Edouard and Laroche, Romain and Urvoy, Tanguy and Maillard, Odalric-Ambrym and Pietquin, Olivier}, + file = {/home/dimitri/Nextcloud/Zotero/storage/Q97ZL24I/Carrara et al. 
- 2019 - Budgeted Reinforcement Learning in Continuous Stat.pdf;/home/dimitri/Nextcloud/Zotero/storage/MKNRYBMH/1903.html} +} + +@article{mehriPretrainingMethodsDialog2019, + archivePrefix = {arXiv}, + eprinttype = {arxiv}, + eprint = {1906.00414}, + primaryClass = {cs}, + title = {Pretraining {{Methods}} for {{Dialog Context Representation Learning}}}, + url = {http://arxiv.org/abs/1906.00414}, + abstract = {This paper examines various unsupervised pretraining objectives for learning dialog context representations. Two novel methods of pretraining dialog context encoders are proposed, and a total of four methods are examined. Each pretraining objective is fine-tuned and evaluated on a set of downstream dialog tasks using the MultiWoz dataset and strong performance improvement is observed. Further evaluation shows that our pretraining objectives result in not only better performance, but also better convergence, models that are less data hungry and have better domain generalizability.}, + urldate = {2019-06-05}, + date = {2019-06-02}, + keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language}, + author = {Mehri, Shikib and Razumovskaia, Evgeniia and Zhao, Tiancheng and Eskenazi, Maxine}, + file = {/home/dimitri/Nextcloud/Zotero/storage/SX4T6RF8/Mehri et al. - 2019 - Pretraining Methods for Dialog Context Representat.pdf;/home/dimitri/Nextcloud/Zotero/storage/GFW2T6ZV/1906.html} +} + +@article{marblestoneIntegrationDeepLearning2016, + archivePrefix = {arXiv}, + eprinttype = {arxiv}, + eprint = {1606.03813}, + primaryClass = {q-bio}, + title = {Towards an Integration of Deep Learning and Neuroscience}, + url = {http://arxiv.org/abs/1606.03813}, + abstract = {Neuroscience has focused on the detailed implementation of computation, studying neural codes, dynamics and circuits. In machine learning, however, artificial neural networks tend to eschew precisely designed codes, dynamics or circuits in favor of brute force optimization of a cost function, often using simple and relatively uniform initial architectures. Two recent developments have emerged within machine learning that create an opportunity to connect these seemingly divergent perspectives. First, structured architectures are used, including dedicated systems for attention, recursion and various forms of short- and long-term memory storage. Second, cost functions and training procedures have become more complex and are varied across layers and over time. Here we think about the brain in terms of these ideas. We hypothesize that (1) the brain optimizes cost functions, (2) these cost functions are diverse and differ across brain locations and over development, and (3) optimization operates within a pre-structured architecture matched to the computational problems posed by behavior. Such a heterogeneously optimized system, enabled by a series of interacting cost functions, serves to make learning data-efficient and precisely targeted to the needs of the organism. We suggest directions by which neuroscience could seek to refine and test these hypotheses.}, + urldate = {2019-06-05}, + date = {2016-06-13}, + keywords = {Quantitative Biology - Neurons and Cognition}, + author = {Marblestone, Adam and Wayne, Greg and Kording, Konrad}, + file = {/home/dimitri/Nextcloud/Zotero/storage/F6UGWEZX/Marblestone et al. 
- 2016 - Towards an integration of deep learning and neuros.pdf;/home/dimitri/Nextcloud/Zotero/storage/SJDK35YE/1606.html} +} + +@inproceedings{barhamMachineLearningSystems2019, + location = {{New York, NY, USA}}, + title = {Machine {{Learning Systems Are Stuck}} in a {{Rut}}}, + isbn = {978-1-4503-6727-1}, + url = {http://doi.acm.org/10.1145/3317550.3321441}, + doi = {10.1145/3317550.3321441}, + abstract = {In this paper we argue that systems for numerical computing are stuck in a local basin of performance and programmability. Systems researchers are doing an excellent job improving the performance of 5-year-old benchmarks, but gradually making it harder to explore innovative machine learning research ideas. We explain how the evolution of hardware accelerators favors compiler back ends that hyper-optimize large monolithic kernels, show how this reliance on high-performance but inflexible kernels reinforces the dominant style of programming model, and argue these programming abstractions lack expressiveness, maintainability, and modularity; all of which hinders research progress. We conclude by noting promising directions in the field, and advocate steps to advance progress towards high-performance general purpose numerical computing systems on modern accelerators.}, + booktitle = {Proceedings of the {{Workshop}} on {{Hot Topics}} in {{Operating Systems}}}, + series = {{{HotOS}} '19}, + publisher = {{ACM}}, + urldate = {2019-06-07}, + date = {2019}, + pages = {177--183}, + author = {Barham, Paul and Isard, Michael}, + file = {/home/dimitri/Nextcloud/Zotero/storage/KK5T5X4X/Barham and Isard - 2019 - Machine Learning Systems Are Stuck in a Rut.pdf}, + venue = {Bertinoro, Italy} +} + +@article{robertShortHistoryMarkov2011, + langid = {english}, + title = {A {{Short History}} of {{Markov Chain Monte Carlo}}: {{Subjective Recollections}} from {{Incomplete Data}}}, + volume = {26}, + issn = {0883-4237}, + url = {http://projecteuclid.org/euclid.ss/1307626568}, + doi = {10.1214/10-STS351}, + shorttitle = {A {{Short History}} of {{Markov Chain Monte Carlo}}}, + number = {1}, + journaltitle = {Statistical Science}, + shortjournal = {Statist. Sci.}, + urldate = {2019-06-13}, + date = {2011-02}, + pages = {102-115}, + author = {Robert, Christian and Casella, George}, + file = {/home/dimitri/Nextcloud/Zotero/storage/6QNRSI58/Robert_Casella_2011_A Short History of Markov Chain Monte Carlo.pdf} +} + +@book{highamAccuracyStabilityNumerical2002, + location = {{Philadelphia}}, + title = {Accuracy and Stability of Numerical Algorithms}, + edition = {2nd ed}, + isbn = {978-0-89871-521-7}, + pagetotal = {680}, + publisher = {{Society for Industrial and Applied Mathematics}}, + date = {2002}, + keywords = {Data processing,Computer algorithms,Numerical analysis}, + author = {Higham, Nicholas J.}, + file = {/home/dimitri/Nextcloud/Zotero/storage/QSQRZ8JC/Higham_2002_Accuracy and stability of numerical algorithms.pdf} +} + +@inproceedings{valiantTheoryLearnable1984, + langid = {english}, + location = {{Not Known}}, + title = {A Theory of the Learnable}, + isbn = {978-0-89791-133-7}, + url = {http://portal.acm.org/citation.cfm?doid=800057.808710}, + doi = {10.1145/800057.808710}, + eventtitle = {The Sixteenth Annual {{ACM}} Symposium}, + booktitle = {Proceedings of the Sixteenth Annual {{ACM}} Symposium on {{Theory}} of Computing - {{STOC}} '84}, + publisher = {{ACM Press}}, + urldate = {2019-06-17}, + date = {1984}, + pages = {436-445}, + author = {Valiant, L. 
G.}, + file = {/home/dimitri/Nextcloud/Zotero/storage/QVP7SBJ3/Valiant - 1984 - A theory of the learnable.pdf} +} + +@article{fujimotoOffPolicyDeepReinforcement2018, + archivePrefix = {arXiv}, + eprinttype = {arxiv}, + eprint = {1812.02900}, + primaryClass = {cs, stat}, + title = {Off-{{Policy Deep Reinforcement Learning}} without {{Exploration}}}, + url = {http://arxiv.org/abs/1812.02900}, + abstract = {Many practical applications of reinforcement learning constrain agents to learn from a fixed batch of data which has already been gathered, without offering further possibility for data collection. In this paper, we demonstrate that due to errors introduced by extrapolation, standard off-policy deep reinforcement learning algorithms, such as DQN and DDPG, are incapable of learning with data uncorrelated to the distribution under the current policy, making them ineffective for this fixed batch setting. We introduce a novel class of off-policy algorithms, batch-constrained reinforcement learning, which restricts the action space in order to force the agent towards behaving close to on-policy with respect to a subset of the given data. We present the first continuous control deep reinforcement learning algorithm which can learn effectively from arbitrary, fixed batch data, and empirically demonstrate the quality of its behavior in several tasks.}, + urldate = {2019-06-18}, + date = {2018-12-06}, + keywords = {Statistics - Machine Learning,Computer Science - Artificial Intelligence,Computer Science - Machine Learning}, + author = {Fujimoto, Scott and Meger, David and Precup, Doina}, + file = {/home/dimitri/Nextcloud/Zotero/storage/EBAYPT9G/Fujimoto et al. - 2018 - Off-Policy Deep Reinforcement Learning without Exp.pdf;/home/dimitri/Nextcloud/Zotero/storage/AGZ8L538/1812.html} +} + +@article{hannaImportanceSamplingPolicy2018, + archivePrefix = {arXiv}, + eprinttype = {arxiv}, + eprint = {1806.01347}, + primaryClass = {cs, stat}, + title = {Importance {{Sampling Policy Evaluation}} with an {{Estimated Behavior Policy}}}, + url = {http://arxiv.org/abs/1806.01347}, + abstract = {We consider the problem of off-policy evaluation in Markov decision processes. Off-policy evaluation is the task of evaluating the expected return of one policy with data generated by a different, behavior policy. Importance sampling is a technique for off-policy evaluation that re-weights off-policy returns to account for differences in the likelihood of the returns between the two policies. In this paper, we study importance sampling with an estimated behavior policy where the behavior policy estimate comes from the same set of data used to compute the importance sampling estimate. We find that this estimator often lowers the mean squared error of off-policy evaluation compared to importance sampling with the true behavior policy or using a behavior policy that is estimated from a separate data set. Intuitively, estimating the behavior policy in this way corrects for error due to sampling in the action-space. Our empirical results also extend to other popular variants of importance sampling and show that estimating a non-Markovian behavior policy can further lower large-sample mean squared error even when the true behavior policy is Markovian.}, + urldate = {2019-06-18}, + date = {2018-06-04}, + keywords = {Statistics - Machine Learning,Computer Science - Artificial Intelligence,Computer Science - Machine Learning}, + author = {Hanna, Josiah P. 
and Niekum, Scott and Stone, Peter}, + file = {/home/dimitri/Nextcloud/Zotero/storage/FFTG8CRE/Hanna et al. - 2018 - Importance Sampling Policy Evaluation with an Esti.pdf;/home/dimitri/Nextcloud/Zotero/storage/D3GYPQ3B/1806.html} +} + +@article{chandakLearningActionRepresentations2019a, + archivePrefix = {arXiv}, + eprinttype = {arxiv}, + eprint = {1902.00183}, + primaryClass = {cs, stat}, + title = {Learning {{Action Representations}} for {{Reinforcement Learning}}}, + url = {http://arxiv.org/abs/1902.00183}, + abstract = {Most model-free reinforcement learning methods leverage state representations (embeddings) for generalization, but either ignore structure in the space of actions or assume the structure is provided a priori. We show how a policy can be decomposed into a component that acts in a low-dimensional space of action representations and a component that transforms these representations into actual actions. These representations improve generalization over large, finite action sets by allowing the agent to infer the outcomes of actions similar to actions already taken. We provide an algorithm to both learn and use action representations and provide conditions for its convergence. The efficacy of the proposed method is demonstrated on large-scale real-world problems.}, + urldate = {2019-06-18}, + date = {2019-01-31}, + keywords = {Statistics - Machine Learning,Computer Science - Machine Learning}, + author = {Chandak, Yash and Theocharous, Georgios and Kostas, James and Jordan, Scott and Thomas, Philip S.}, + file = {/home/dimitri/Nextcloud/Zotero/storage/IALS2P6C/Chandak et al. - 2019 - Learning Action Representations for Reinforcement .pdf;/home/dimitri/Nextcloud/Zotero/storage/SC7ZUA3I/1902.html} +} + +@article{gottesmanCombiningParametricNonparametric2019, + archivePrefix = {arXiv}, + eprinttype = {arxiv}, + eprint = {1905.05787}, + primaryClass = {cs, stat}, + title = {Combining {{Parametric}} and {{Nonparametric Models}} for {{Off}}-{{Policy Evaluation}}}, + url = {http://arxiv.org/abs/1905.05787}, + abstract = {We consider a model-based approach to perform batch off-policy evaluation in reinforcement learning. Our method takes a mixture-of-experts approach to combine parametric and non-parametric models of the environment such that the final value estimate has the least expected error. We do so by first estimating the local accuracy of each model and then using a planner to select which model to use at every time step as to minimize the return error estimate along entire trajectories. Across a variety of domains, our mixture-based approach outperforms the individual models alone as well as state-of-the-art importance sampling-based estimators.}, + urldate = {2019-06-18}, + date = {2019-05-14}, + keywords = {Statistics - Machine Learning,Computer Science - Machine Learning}, + author = {Gottesman, Omer and Liu, Yao and Sussex, Scott and Brunskill, Emma and Doshi-Velez, Finale}, + file = {/home/dimitri/Nextcloud/Zotero/storage/XXS4XBWY/Gottesman et al. 
- 2019 - Combining Parametric and Nonparametric Models for .pdf;/home/dimitri/Nextcloud/Zotero/storage/YTHD3LMY/1905.html} +} + +@article{leBatchPolicyLearning2019, + archivePrefix = {arXiv}, + eprinttype = {arxiv}, + eprint = {1903.08738}, + primaryClass = {cs, math, stat}, + title = {Batch {{Policy Learning}} under {{Constraints}}}, + url = {http://arxiv.org/abs/1903.08738}, + abstract = {When learning policies for real-world domains, two important questions arise: (i) how to efficiently use pre-collected off-policy, non-optimal behavior data; and (ii) how to mediate among different competing objectives and constraints. We thus study the problem of batch policy learning under multiple constraints, and offer a systematic solution. We first propose a flexible meta-algorithm that admits any batch reinforcement learning and online learning procedure as subroutines. We then present a specific algorithmic instantiation and provide performance guarantees for the main objective and all constraints. To certify constraint satisfaction, we propose a new and simple method for off-policy policy evaluation (OPE) and derive PAC-style bounds. Our algorithm achieves strong empirical results in different domains, including in a challenging problem of simulated car driving subject to multiple constraints such as lane keeping and smooth driving. We also show experimentally that our OPE method outperforms other popular OPE techniques on a standalone basis, especially in a high-dimensional setting.}, + urldate = {2019-06-18}, + date = {2019-03-20}, + keywords = {Statistics - Machine Learning,Computer Science - Artificial Intelligence,Computer Science - Machine Learning,Mathematics - Optimization and Control}, + author = {Le, Hoang M. and Voloshin, Cameron and Yue, Yisong}, + file = {/home/dimitri/Nextcloud/Zotero/storage/ELWQXAKB/Le et al. - 2019 - Batch Policy Learning under Constraints.pdf;/home/dimitri/Nextcloud/Zotero/storage/34RTF36T/1903.html} +} + +@article{yangXLNetGeneralizedAutoregressive2019, + archivePrefix = {arXiv}, + eprinttype = {arxiv}, + eprint = {1906.08237}, + primaryClass = {cs}, + title = {{{XLNet}}: {{Generalized Autoregressive Pretraining}} for {{Language Understanding}}}, + url = {http://arxiv.org/abs/1906.08237}, + shorttitle = {{{XLNet}}}, + abstract = {With the capability of modeling bidirectional contexts, denoising autoencoding based pretraining like BERT achieves better performance than pretraining approaches based on autoregressive language modeling. However, relying on corrupting the input with masks, BERT neglects dependency between the masked positions and suffers from a pretrain-finetune discrepancy. In light of these pros and cons, we propose XLNet, a generalized autoregressive pretraining method that (1) enables learning bidirectional contexts by maximizing the expected likelihood over all permutations of the factorization order and (2) overcomes the limitations of BERT thanks to its autoregressive formulation. Furthermore, XLNet integrates ideas from Transformer-XL, the state-of-the-art autoregressive model, into pretraining. 
Empirically, XLNet outperforms BERT on 20 tasks, often by a large margin, and achieves state-of-the-art results on 18 tasks including question answering, natural language inference, sentiment analysis, and document ranking.},
+ urldate = {2019-06-21},
+ date = {2019-06-19},
+ keywords = {Computer Science - Computation and Language,Computer Science - Machine Learning},
+ author = {Yang, Zhilin and Dai, Zihang and Yang, Yiming and Carbonell, Jaime and Salakhutdinov, Ruslan and Le, Quoc V.},
+ file = {/home/dimitri/Nextcloud/Zotero/storage/FCUWCRCK/Yang et al. - 2019 - XLNet Generalized Autoregressive Pretraining for .pdf;/home/dimitri/Nextcloud/Zotero/storage/FB7CLIH3/1906.html}
+}
+
+@article{coenenVisualizingMeasuringGeometry2019,
+ archivePrefix = {arXiv},
+ eprinttype = {arxiv},
+ eprint = {1906.02715},
+ primaryClass = {cs, stat},
+ title = {Visualizing and {{Measuring}} the {{Geometry}} of {{BERT}}},
+ url = {http://arxiv.org/abs/1906.02715},
+ abstract = {Transformer architectures show significant promise for natural language processing. Given that a single pretrained model can be fine-tuned to perform well on many different tasks, these networks appear to extract generally useful linguistic features. A natural question is how such networks represent this information internally. This paper describes qualitative and quantitative investigations of one particularly effective model, BERT. At a high level, linguistic features seem to be represented in separate semantic and syntactic subspaces. We find evidence of a fine-grained geometric representation of word senses. We also present empirical descriptions of syntactic representations in both attention matrices and individual word embeddings, as well as a mathematical argument to explain the geometry of these representations.},
+ urldate = {2019-06-21},
+ date = {2019-06-06},
+ keywords = {Statistics - Machine Learning,Computer Science - Computation and Language,Computer Science - Machine Learning},
+ author = {Coenen, Andy and Reif, Emily and Yuan, Ann and Kim, Been and Pearce, Adam and Viégas, Fernanda and Wattenberg, Martin},
+ file = {/home/dimitri/Nextcloud/Zotero/storage/7D3K8L65/Coenen et al. - 2019 - Visualizing and Measuring the Geometry of BERT.pdf;/home/dimitri/Nextcloud/Zotero/storage/7WX24LBK/1906.html}
+}
+
+@article{pacchianoWassersteinReinforcementLearning2019,
+ archivePrefix = {arXiv},
+ eprinttype = {arxiv},
+ eprint = {1906.04349},
+ primaryClass = {cs, stat},
+ title = {Wasserstein {{Reinforcement Learning}}},
+ url = {http://arxiv.org/abs/1906.04349},
+ abstract = {We propose behavior-driven optimization via Wasserstein distances (WDs) to improve several classes of state-of-the-art reinforcement learning (RL) algorithms. We show that WD regularizers acting on appropriate policy embeddings efficiently incorporate behavioral characteristics into policy optimization. We demonstrate that they improve Evolution Strategy methods by encouraging more efficient exploration, can be applied in imitation learning and to speed up training of Trust Region Policy Optimization methods. Since the exact computation of WDs is expensive, we develop approximate algorithms based on the combination of different methods: dual formulation of the optimal transport problem, alternating optimization and random feature maps, to effectively replace exact WD computations in the RL tasks considered. We provide theoretical analysis of our algorithms and exhaustive empirical evaluation in a variety of RL settings.},
+ urldate = {2019-06-21},
+ date = {2019-06-10},
+ keywords = {Statistics - Machine Learning,Computer Science - Machine Learning},
+ author = {Pacchiano, Aldo and Parker-Holder, Jack and Tang, Yunhao and Choromanska, Anna and Choromanski, Krzysztof and Jordan, Michael I.},
+ file = {/home/dimitri/Nextcloud/Zotero/storage/PW86XBCQ/Pacchiano et al. - 2019 - Wasserstein Reinforcement Learning.pdf;/home/dimitri/Nextcloud/Zotero/storage/4BPXH577/1906.html}
+}
+
+@article{norenzayanCulturalEvolutionProsocial2016,
+ langid = {english},
+ title = {The Cultural Evolution of Prosocial Religions},
+ volume = {39},
+ issn = {0140-525X, 1469-1825},
+ url = {https://www.cambridge.org/core/journals/behavioral-and-brain-sciences/article/cultural-evolution-of-prosocial-religions/01B053B0294890F8CFACFB808FE2A0EF},
+ doi = {10.1017/S0140525X14001356},
+ abstract = {We develop a cultural evolutionary theory of the origins of prosocial religions and apply it to resolve two puzzles in human psychology and cultural history: (1) the rise of large-scale cooperation among strangers and, simultaneously, (2) the spread of prosocial religions in the last 10–12 millennia. We argue that these two developments were importantly linked and mutually energizing. We explain how a package of culturally evolved religious beliefs and practices characterized by increasingly potent, moralizing, supernatural agents, credible displays of faith, and other psychologically active elements conducive to social solidarity promoted high fertility rates and large-scale cooperation with co-religionists, often contributing to success in intergroup competition and conflict. In turn, prosocial religious beliefs and practices spread and aggregated as these successful groups expanded, or were copied by less successful groups. This synthesis is grounded in the idea that although religious beliefs and practices originally arose as nonadaptive by-products of innate cognitive functions, particular cultural variants were then selected for their prosocial effects in a long-term, cultural evolutionary process. This framework (1) reconciles key aspects of the adaptationist and by-product approaches to the origins of religion, (2) explains a variety of empirical observations that have not received adequate attention, and (3) generates novel predictions. Converging lines of evidence drawn from diverse disciplines provide empirical support while at the same time encouraging new research directions and opening up new questions for exploration and debate.},
+ journaltitle = {Behavioral and Brain Sciences},
+ urldate = {2019-06-21},
+ date = {2016},
+ keywords = {belief,cooperation,culture,evolution,prosociality,religion,ritual},
+ author = {Norenzayan, Ara and Shariff, Azim F. and Gervais, Will M. and Willard, Aiyana K. and McNamara, Rita A. and Slingerland, Edward and Henrich, Joseph},
+ file = {/home/dimitri/Nextcloud/Zotero/storage/CQ8HVTSM/Norenzayan et al_2016_The cultural evolution of prosocial religions.pdf;/home/dimitri/Nextcloud/Zotero/storage/C2EHTZPD/01B053B0294890F8CFACFB808FE2A0EF.html}
+}
+
+@article{atranEvolutionReligionHow2010,
+ langid = {english},
+ title = {The {{Evolution}} of {{Religion}}: {{How Cognitive By}}-{{Products}}, {{Adaptive Learning Heuristics}}, {{Ritual Displays}}, and {{Group Competition Generate Deep Commitments}} to {{Prosocial Religions}}},
+ volume = {5},
+ issn = {1555-5542, 1555-5550},
+ url = {http://link.springer.com/10.1162/BIOT_a_00018},
+ doi = {10.1162/BIOT_a_00018},
+ shorttitle = {The {{Evolution}} of {{Religion}}},
+ number = {1},
+ journaltitle = {Biological Theory},
+ shortjournal = {Biol Theory},
+ urldate = {2019-06-21},
+ date = {2010-03},
+ pages = {18-30},
+ author = {Atran, Scott and Henrich, Joseph},
+ file = {/home/dimitri/Nextcloud/Zotero/storage/M4SNZT3C/Atran_Henrich_2010_The Evolution of Religion.pdf}
+}
+
+@article{henrichEvolutionCostlyDisplays2009,
+ langid = {english},
+ title = {The Evolution of Costly Displays, Cooperation and Religion},
+ volume = {30},
+ issn = {1090-5138},
+ url = {https://linkinghub.elsevier.com/retrieve/pii/S1090513809000245},
+ doi = {10.1016/j.evolhumbehav.2009.03.005},
+ number = {4},
+ journaltitle = {Evolution and Human Behavior},
+ shortjournal = {Evolution and Human Behavior},
+ urldate = {2019-06-21},
+ date = {2009-07},
+ pages = {244-260},
+ author = {Henrich, Joseph},
+ file = {/home/dimitri/Nextcloud/Zotero/storage/EKRNQ87Q/Henrich_2009_The evolution of costly displays, cooperation and religion.pdf}
+}
+
+@article{irpanOffPolicyEvaluationOffPolicy2019,
+ archivePrefix = {arXiv},
+ eprinttype = {arxiv},
+ eprint = {1906.01624},
+ primaryClass = {cs, stat},
+ title = {Off-{{Policy Evaluation}} via {{Off}}-{{Policy Classification}}},
+ url = {http://arxiv.org/abs/1906.01624},
+ abstract = {In this work, we consider the problem of model selection for deep reinforcement learning (RL) in real-world environments. Typically, the performance of deep RL algorithms is evaluated via on-policy interactions with the target environment. However, comparing models in a real-world environment for the purposes of early stopping or hyperparameter tuning is costly and often practically infeasible. This leads us to examine off-policy policy evaluation (OPE) in such settings. We focus on OPE for value-based methods, which are of particular interest in deep RL, with applications like robotics, where off-policy algorithms based on Q-function estimation can often attain better sample complexity than direct policy optimization. Existing OPE metrics either rely on a model of the environment, or the use of importance sampling (IS) to correct for the data being off-policy. However, for high-dimensional observations, such as images, models of the environment can be difficult to fit and value-based methods can make IS hard to use or even ill-conditioned, especially when dealing with continuous action spaces. In this paper, we focus on the specific case of MDPs with continuous action spaces and sparse binary rewards, which is representative of many important real-world applications. We propose an alternative metric that relies on neither models nor IS, by framing OPE as a positive-unlabeled (PU) classification problem with the Q-function as the decision function. We experimentally show that this metric outperforms baselines on a number of tasks. Most importantly, it can reliably predict the relative performance of different policies in a number of generalization scenarios, including the transfer to the real-world of policies trained in simulation for an image-based robotic manipulation task.},
+ urldate = {2019-06-24},
+ date = {2019-06-04},
+ keywords = {Statistics - Machine Learning,Computer Science - Artificial Intelligence,Computer Science - Machine Learning,Computer Science - Robotics},
+ author = {Irpan, Alex and Rao, Kanishka and Bousmalis, Konstantinos and Harris, Chris and Ibarz, Julian and Levine, Sergey},
+ file = {/home/dimitri/Nextcloud/Zotero/storage/2DE9I4IE/Irpan et al. - 2019 - Off-Policy Evaluation via Off-Policy Classificatio.pdf;/home/dimitri/Nextcloud/Zotero/storage/I3D7GBEL/1906.html}
+}
+
+@article{bradleyWhatAppliedCategory2018,
+ archivePrefix = {arXiv},
+ eprinttype = {arxiv},
+ eprint = {1809.05923},
+ primaryClass = {math},
+ title = {What Is {{Applied Category Theory}}?},
+ url = {http://arxiv.org/abs/1809.05923},
+ abstract = {This is a collection of introductory, expository notes on applied category theory, inspired by the 2018 Applied Category Theory Workshop, and in these notes we take a leisurely stroll through two themes (functorial semantics and compositionality), two constructions (monoidal categories and decorated cospans) and two examples (chemical reaction networks and natural language processing) within the field.},
+ urldate = {2019-06-25},
+ date = {2018-09-16},
+ keywords = {Mathematics - Category Theory},
+ author = {Bradley, Tai-Danae},
+ file = {/home/dimitri/Nextcloud/Zotero/storage/WCBRMZVX/Bradley - 2018 - What is Applied Category Theory.pdf;/home/dimitri/Nextcloud/Zotero/storage/RS4RK4HS/1809.html}
+}
+
+@book{spivakCategoryTheorySciences2014,
+ location = {{Cambridge, Massachusetts}},
+ title = {Category Theory for the Sciences},
+ isbn = {978-0-262-02813-4},
+ pagetotal = {486},
+ publisher = {{The MIT Press}},
+ date = {2014},
+ keywords = {Categories (Mathematics),Mathematical models,Science},
+ author = {Spivak, David I.},
+ file = {/home/dimitri/Nextcloud/Zotero/storage/HI5KSG9I/Spivak - 2014 - Category theory for the sciences.pdf}
+}
+
+@article{leinsterBasicCategoryTheory2016,
+ archivePrefix = {arXiv},
+ eprinttype = {arxiv},
+ eprint = {1612.09375},
+ primaryClass = {math},
+ title = {Basic {{Category Theory}}},
+ url = {http://arxiv.org/abs/1612.09375},
+ abstract = {This short introduction to category theory is for readers with relatively little mathematical background. At its heart is the concept of a universal property, important throughout mathematics. After a chapter introducing the basic definitions, separate chapters present three ways of expressing universal properties: via adjoint functors, representable functors, and limits. A final chapter ties the three together. For each new categorical concept, a generous supply of examples is provided, taken from different parts of mathematics. At points where the leap in abstraction is particularly great (such as the Yoneda lemma), the reader will find careful and extensive explanations.},
+ urldate = {2019-06-27},
+ date = {2016-12-29},
+ keywords = {Mathematics - Algebraic Topology,Mathematics - Category Theory,Mathematics - Logic},
+ author = {Leinster, Tom},
+ file = {/home/dimitri/Nextcloud/Zotero/storage/F3NW5R44/Leinster - 2016 - Basic Category Theory.pdf;/home/dimitri/Nextcloud/Zotero/storage/3BNPC6VP/1612.html}
+}
+
+@book{riehlCategoryTheoryContext2017,
+ langid = {english},
+ location = {{United States}},
+ title = {Category Theory in Context},
+ isbn = {978-0-486-82080-4},
+ publisher = {{Dover Publications}},
+ date = {2017},
+ author = {Riehl, Emily},
+ file = {/home/dimitri/Nextcloud/Zotero/storage/H2XLYX3I/Riehl - 2017 - Category theory in context.pdf},
+ note = {OCLC: 1098977147}
+}
+
+@article{maheswaranathanReverseEngineeringRecurrent2019,
+ archivePrefix = {arXiv},
+ eprinttype = {arxiv},
+ eprint = {1906.10720},
+ primaryClass = {cs, stat},
+ title = {Reverse Engineering Recurrent Networks for Sentiment Classification Reveals Line Attractor Dynamics},
+ url = {http://arxiv.org/abs/1906.10720},
+ abstract = {Recurrent neural networks (RNNs) are a widely used tool for modeling sequential data, yet they are often treated as inscrutable black boxes. Given a trained recurrent network, we would like to reverse engineer it--to obtain a quantitative, interpretable description of how it solves a particular task. Even for simple tasks, a detailed understanding of how recurrent networks work, or a prescription for how to develop such an understanding, remains elusive. In this work, we use tools from dynamical systems analysis to reverse engineer recurrent networks trained to perform sentiment classification, a foundational natural language processing task. Given a trained network, we find fixed points of the recurrent dynamics and linearize the nonlinear system around these fixed points. Despite their theoretical capacity to implement complex, high-dimensional computations, we find that trained networks converge to highly interpretable, low-dimensional representations. In particular, the topological structure of the fixed points and corresponding linearized dynamics reveal an approximate line attractor within the RNN, which we can use to quantitatively understand how the RNN solves the sentiment analysis task. Finally, we find this mechanism present across RNN architectures (including LSTMs, GRUs, and vanilla RNNs) trained on multiple datasets, suggesting that our findings are not unique to a particular architecture or dataset. Overall, these results demonstrate that surprisingly universal and human interpretable computations can arise across a range of recurrent networks.},
+ urldate = {2019-06-28},
+ date = {2019-06-25},
+ keywords = {Statistics - Machine Learning,Computer Science - Machine Learning},
+ author = {Maheswaranathan, Niru and Williams, Alex and Golub, Matthew D. and Ganguli, Surya and Sussillo, David},
+ file = {/home/dimitri/Nextcloud/Zotero/storage/KSL5IDVM/Maheswaranathan et al. - 2019 - Reverse engineering recurrent networks for sentime.pdf;/home/dimitri/Nextcloud/Zotero/storage/LLWPJ6D8/1906.html}
+}
+
+@article{yurochkinHierarchicalOptimalTransport2019,
+ archivePrefix = {arXiv},
+ eprinttype = {arxiv},
+ eprint = {1906.10827},
+ primaryClass = {cs, stat},
+ title = {Hierarchical {{Optimal Transport}} for {{Document Representation}}},
+ url = {http://arxiv.org/abs/1906.10827},
+ abstract = {The ability to measure similarity between documents enables intelligent summarization and analysis of large corpora. Past distances between documents suffer from either an inability to incorporate semantic similarities between words or from scalability issues. As an alternative, we introduce hierarchical optimal transport as a meta-distance between documents, where documents are modeled as distributions over topics, which themselves are modeled as distributions over words. We then solve an optimal transport problem on the smaller topic space to compute a similarity score. We give conditions on the topics under which this construction defines a distance, and we relate it to the word mover's distance. We evaluate our technique for \$k\$-NN classification and show better interpretability and scalability with comparable performance to current methods at a fraction of the cost.},
+ urldate = {2019-06-28},
+ date = {2019-06-25},
+ keywords = {Statistics - Machine Learning,Computer Science - Computation and Language,Computer Science - Machine Learning,Computer Science - Information Retrieval},
+ author = {Yurochkin, Mikhail and Claici, Sebastian and Chien, Edward and Mirzazadeh, Farzaneh and Solomon, Justin},
+ file = {/home/dimitri/Nextcloud/Zotero/storage/EJGKCIUG/Yurochkin et al. - 2019 - Hierarchical Optimal Transport for Document Repres.pdf;/home/dimitri/Nextcloud/Zotero/storage/EC9XIVU7/1906.html}
+}
+
+@article{betancourtConceptualIntroductionHamiltonian2017,
+ archivePrefix = {arXiv},
+ eprinttype = {arxiv},
+ eprint = {1701.02434},
+ primaryClass = {stat},
+ title = {A {{Conceptual Introduction}} to {{Hamiltonian Monte Carlo}}},
+ url = {http://arxiv.org/abs/1701.02434},
+ abstract = {Hamiltonian Monte Carlo has proven a remarkable empirical success, but only recently have we begun to develop a rigorous understanding of why it performs so well on difficult problems and how it is best applied in practice. Unfortunately, that understanding is confined within the mathematics of differential geometry which has limited its dissemination, especially to the applied communities for which it is particularly important. In this review I provide a comprehensive conceptual account of these theoretical foundations, focusing on developing a principled intuition behind the method and its optimal implementations rather than any exhaustive rigor. Whether a practitioner or a statistician, the dedicated reader will acquire a solid grasp of how Hamiltonian Monte Carlo works, when it succeeds, and, perhaps most importantly, when it fails.},
+ urldate = {2019-06-28},
+ date = {2017-01-09},
+ keywords = {Statistics - Methodology},
+ author = {Betancourt, Michael},
+ file = {/home/dimitri/Nextcloud/Zotero/storage/8ZS8KLKS/Betancourt_2017_A Conceptual Introduction to Hamiltonian Monte Carlo.pdf;/home/dimitri/Nextcloud/Zotero/storage/UG4K45FI/1701.html}
+}
+
+@article{diaconisMarkovChainMonte2008,
+ langid = {english},
+ title = {The {{Markov}} Chain {{Monte Carlo}} Revolution},
+ volume = {46},
+ issn = {0273-0979},
+ url = {http://www.ams.org/journal-getitem?pii=S0273-0979-08-01238-X},
+ doi = {10.1090/S0273-0979-08-01238-X},
+ number = {2},
+ journaltitle = {Bulletin of the American Mathematical Society},
+ shortjournal = {Bull. Amer. Math. Soc.},
+ urldate = {2019-06-28},
+ date = {2008-11-20},
+ pages = {179-205},
+ author = {Diaconis, Persi},
+ file = {/home/dimitri/Nextcloud/Zotero/storage/B34NLYEQ/Diaconis_2008_The Markov chain Monte Carlo revolution.pdf}
+}
+
+@thesis{rainforthAutomatingInferenceLearning2017,
+ langid = {english},
+ title = {Automating Inference, Learning, and Design Using Probabilistic Programming},
+ url = {https://ora.ox.ac.uk/objects/uuid:e276f3b4-ff1d-44bf-9d67-013f68ce81f0#citeForm},
+ abstract = {Imagine a world where computational simulations can be inverted as easily as running them forwards, where data can be used to refine models automatically, and where the only expertise one needs to carry out powerful statistical analysis is a basic proficiency in scientific coding. Creating such a world is the ambitious long-term aim of probabilistic programming. The bottleneck for improving the probabilistic models, or simulators, used throughout the quantitative sciences, is often not an ability to devise better models conceptually, but a lack of expertise, time, or resources to realize such innovations. Probabilistic programming systems (PPSs) help alleviate this bottleneck by providing an expressive and accessible modeling framework, then automating the required computation to draw inferences from the model, for example finding the model parameters likely to give rise to a certain output. By decoupling model specification and inference, PPSs streamline the process of developing and drawing inferences from new models, while opening up powerful statistical methods to non-experts. Many systems further provide the flexibility to write new and exciting models which would be hard, or even impossible, to convey using conventional statistical frameworks. The central goal of this thesis is to improve and extend PPSs. In particular, we will make advancements to the underlying inference engines and increase the range of problems which can be tackled. For example, we will extend PPSs to a mixed inference-optimization framework, thereby providing automation of tasks such as model learning and engineering design. Meanwhile, we make inroads into constructing systems for automating adaptive sequential design problems, providing potential applications across the sciences. Furthermore, the contributions of the work reach far beyond probabilistic programming, as achieving our goal will require us to make advancements in a number of related fields such as particle Markov chain Monte Carlo methods, Bayesian optimization, and Monte Carlo fundamentals.},
+ institution = {{University of Oxford}},
+ type = {PhD thesis},
+ urldate = {2019-06-28},
+ date = {2017},
+ author = {Rainforth, Thomas William Gamlen},
+ file = {/home/dimitri/Nextcloud/Zotero/storage/4QUDLF5N/Rainforth_2017_Automating inference, learning, and design using probabilistic programming.pdf;/home/dimitri/Nextcloud/Zotero/storage/E2GS2T7K/uuide276f3b4-ff1d-44bf-9d67-013f68ce81f0.html}
+}
+