[ ( "#active-learning" , ( "" , "" , "" , "" , "" ) ) , ( "#add-more-disk-space" , ( "" , "" , "" , "" , "" ) ) , ( "#advantages-disadvantages" , ( "" , "" , "" , "" , "" ) ) , ( "#appendix" , ( "" , "" , "" , "" , "" ) ) , ( "#august" , ( "" , "" , "" , "" , "" ) ) , ( "#b-hyperparameters" , ( "" , "" , "" , "" , "" ) ) , ( "#b-samples" , ( "" , "" , "" , "" , "" ) ) , ( "#b-training" , ( "" , "" , "" , "" , "" ) ) , ( "#bradley-terry-preference-learning" , ( "" , "" , "" , "" , "" ) ) , ( "#cleaning-project-gutenberg-contemporary-poetry" , ( "" , "" , "" , "" , "" ) ) , ( "#create-a-swapfile" , ( "" , "" , "" , "" , "" ) ) , ( "#credits" , ( "" , "" , "" , "" , "" ) ) , ( "#data-the-project-gutenberg-poetry-corpus" , ( "" , "" , "" , "" , "" ) ) , ( "#essay-on-criticism" , ( "" , "" , "" , "" , "" ) ) , ( "#external-links" , ( "" , "" , "" , "" , "" ) ) , ( "#famous-first-lines" , ( "" , "" , "" , "" , "" ) ) , ( "#full-bradley-terry-training" , ( "" , "" , "" , "" , "" ) ) , ( "#gcp" , ( "" , "" , "" , "" , "" ) ) , ( "#google-colab" , ( "" , "" , "" , "" , "" ) ) , ( "#gpt-2-1.5b" , ( "" , "" , "" , "" , "" ) ) , ( "#gpt-2-345m" , ( "" , "" , "" , "" , "" ) ) , ( "#gpt-2-poetry-prefix-completions" , ( "" , "" , "" , "" , "" ) ) , ( "#gpt-2-poetry-prefix-samples" , ( "" , "" , "" , "" , "" ) ) , ( "#gpt-2-poetry-samples" , ( "" , "" , "" , "" , "" ) ) , ( "#gpt-2-small-generating-poetry" , ( "" , "" , "" , "" , "" ) ) , ( "#gpu-failures" , ( "" , "" , "" , "" , "" ) ) , ( "#hamlet-william-shakespeare" , ( "" , "" , "" , "" , "" ) ) , ( "#haproxy" , ( "" , "" , "" , "" , "" ) ) , ( "#haproxy-deprecated-use-iptables" , ( "" , "" , "" , "" , "" ) ) , ( "#howl" , ( "" , "" , "" , "" , "" ) ) , ( "#improvements" , ( "" , "" , "" , "" , "" ) ) , ( "#initial-setup" , ( "" , "" , "" , "" , "" ) ) , ( "#invictus-william-ernest-henley" , ( "" , "" , "" , "" , "" ) ) , ( "#iptables" , ( "" , "" , "" , "" , "" ) ) , ( "#is-preference-learning-a-bradley-terry-model" , ( "" , 
"" , "" , "" , "" ) ) , ( "#jabberwocky-lewis-carroll" , ( "" , "" , "" , "" , "" ) ) , ( "#license" , ( "" , "" , "" , "" , "" ) ) , ( "#loss-1.3" , ( "" , "" , "" , "" , "" ) ) , ( "#loss-1.6" , ( "" , "" , "" , "" , "" ) ) , ( "#loss-2.6" , ( "" , "" , "" , "" , "" ) ) , ( "#measuring-network-performance-with-iperf" , ( "" , "" , "" , "" , "" ) ) , ( "#memory-box" , ( "" , "" , "" , "" , "" ) ) , ( "#notes" , ( "" , "" , "" , "" , "" ) ) , ( "#overall" , ( "" , "" , "" , "" , "" ) ) , ( "#ozymandias" , ( "" , "" , "" , "" , "" ) ) , ( "#pioneers-o-pioneers-walt-whitman" , ( "" , "" , "" , "" , "" ) ) , ( "#random-samples" , ( "" , "" , "" , "" , "" ) ) , ( "#references" , ( "" , "" , "" , "" , "" ) ) , ( "#romeo-juliet-william-shakespeare" , ( "" , "" , "" , "" , "" ) ) , ( "#router-vm-creation" , ( "" , "" , "" , "" , "" ) ) , ( "#router-vm-initial-setup" , ( "" , "" , "" , "" , "" ) ) , ( "#router-vm-setup" , ( "" , "" , "" , "" , "" ) ) , ( "#sailing-to-byzantium-yeats" , ( "" , "" , "" , "" , "" ) ) , ( "#samples" , ( "" , "" , "" , "" , "" ) ) , ( "#section" , ( "" , "" , "" , "" , "" ) ) , ( "#set-up-gpt-2" , ( "" , "" , "" , "" , "" ) ) , ( "#set-up-scrap-utilities" , ( "" , "" , "" , "" , "" ) ) , ( "#set-up-stylegan2" , ( "" , "" , "" , "" , "" ) ) , ( "#set-up-tensorflow-1.15" , ( "" , "" , "" , "" , "" ) ) , ( "#shawwns-ssh-pubkey" , ( "" , "" , "" , "" , "" ) ) , ( "#site" , ( "" , "" , "" , "" , "" ) ) , ( "#sonnet-29-shakespeare" , ( "" , "" , "" , "" , "" ) ) , ( "#swarm-training" , ( "" , "" , "" , "" , "" ) ) , ( "#swarm-vm-creation" , ( "" , "" , "" , "" , "" ) ) , ( "#swarm-vm-initial-setup" , ( "" , "" , "" , "" , "" ) ) , ( "#swarm-vm-setup" , ( "" , "" , "" , "" , "" ) ) , ( "#tao-te-ching" , ( "" , "" , "" , "" , "" ) ) , ( "#the-love-song-of-j.-alfred-prufrock-t.s.-eliot" , ( "" , "" , "" , "" , "" ) ) , ( "#top-supporters" , ( "" , "" , "" , "" , "" ) ) , ( "#training" , ( "" , "" , "" , "" , "" ) ) , ( "#training-gpt-2-poetry" , ( "" , 
"" , "" , "" , "" ) ) , ( "#training-gpt-2-poetry-prefix" , ( "" , "" , "" , "" , "" ) ) , ( "#training-gpt-2-small-to-generate-poetry" , ( "" , "" , "" , "" , "" ) ) , ( "#training-samples" , ( "" , "" , "" , "" , "" ) ) , ( "#training-samples-1" , ( "" , "" , "" , "" , "" ) ) , ( "#ulysses-lord-alfred-tennyson" , ( "" , "" , "" , "" , "" ) ) , ( "#unconditional-samples" , ( "" , "" , "" , "" , "" ) ) , ( "#vm-setup" , ( "" , "" , "" , "" , "" ) ) , ( "/About" , ( "About This Website" , "shawwn" , "shawwn" , "" , "Meta page describing shawwn.com; copyright license" ) ) , ( "/Faces#fn27" , ( "" , "shawwn" , "shawwn" , "" , "404 Not Found Error: no page by this name!" ) ) , ( "/GPT-2-music" , ( "" , "shawwn" , "" , "" , "404 Not Found Error: no page by this name!" ) ) , ( "/GPT-2-preference-learning" , ( "" , "shawwn" , "" , "" , "404 Not Found Error: no page by this name!" ) ) , ( "/GPT-2-preference-learning#bradley-terry-preference-learning" , ( "" , "shawwn" , "" , "" , "404 Not Found Error: no page by this name!" ) ) , ( "/GPT-2-preference-learning#optimization-by-backprop-not-blackbox" , ( "" , "shawwn" , "" , "" , "404 Not Found Error: no page by this name!" ) ) , ( "/Links" , ( "Links" , "shawwn" , "shawwn" , "" , "Who am I online & what have I done? - Contact information; sites I use; things I've worked on" ) ) , ( "/RNN-metadata" , ( "" , "shawwn" , "" , "" , "404 Not Found Error: no page by this name!" ) ) , ( "/Resorter" , ( "" , "shawwn" , "shawwn" , "" , "404 Not Found Error: no page by this name!" ) ) , ( "/TWDE#text" , ( "" , "shawwn" , "" , "" , "404 Not Found Error: no page by this name!" ) ) , ( "/TWDNE" , ( "" , "shawwn" , "shawwn" , "" , "404 Not Found Error: no page by this name!" ) ) , ( "/Tea#water-experiment" , ( "" , "shawwn" , "shawwn" , "" , "404 Not Found Error: no page by this name!" ) ) , ( "/Tool-AI" , ( "" , "shawwn" , "shawwn" , "" , "404 Not Found Error: no page by this name!" 
) ) , ( "/docs/ai/2019-03-06-gpt2-poetry-1000samples.txt" , ( "" , "shawwn" , "shawwn" , "" , "404 Not Found Error: no page by this name!" ) ) , ( "/docs/ai/2019-03-06-gpt2-poetry-prefix-1000samples.txt" , ( "" , "shawwn" , "shawwn" , "" , "404 Not Found Error: no page by this name!" ) ) , ( "/docs/ai/2019-03-16-gpt2-poetry-prefix-jabberwocky-100samples.txt" , ( "" , "shawwn" , "shawwn" , "" , "404 Not Found Error: no page by this name!" ) ) , ( "/docs/ai/2019-05-13-gpt2-poetry-345m-5000samples.txt" , ( "" , "shawwn" , "shawwn" , "" , "404 Not Found Error: no page by this name!" ) ) , ( "/docs/ai/2019-05-24-gpt2-poetry-yeatssecondcoming-500completions.txt" , ( "" , "shawwn" , "shawwn" , "" , "404 Not Found Error: no page by this name!" ) ) , ( "/docs/ai/2019-07-19-taotehching-ch1-1ksamples.txt" , ( "" , "shawwn" , "shawwn" , "" , "404 Not Found Error: no page by this name!" ) ) , ( "/docs/ai/2019-07-21-taotehching-all-1ksamples.txt" , ( "" , "shawwn" , "shawwn" , "" , "404 Not Found Error: no page by this name!" ) ) , ( "/docs/ai/2019-07-22-gpt2-345m-taotehching-all-ch181.tar.xz" , ( "" , "shawwn" , "shawwn" , "" , "404 Not Found Error: no page by this name!" ) ) , ( "/docs/ai/2019-12-16-gpt21.5b-poetry-samples-topp080.txt" , ( "" , "shawwn" , "" , "" , "404 Not Found Error: no page by this name!" ) ) , ( "/docs/ai/poetry/2019-03-06-gpt2-poetry-1000samples.txt" , ( "" , "shawwn" , "" , "" , "404 Not Found Error: no page by this name!" ) ) , ( "/docs/ai/poetry/2019-03-06-gpt2-poetry-prefix-1000samples.txt" , ( "" , "shawwn" , "" , "" , "404 Not Found Error: no page by this name!" ) ) , ( "/docs/ai/poetry/2019-03-16-gpt2-poetry-prefix-jabberwocky-100samples.txt" , ( "" , "shawwn" , "" , "" , "404 Not Found Error: no page by this name!" ) ) , ( "/docs/ai/poetry/2019-05-13-gpt2-poetry-345m-5000samples.txt" , ( "" , "shawwn" , "" , "" , "404 Not Found Error: no page by this name!" 
) ) , ( "/docs/ai/poetry/2019-05-24-gpt2-poetry-yeatssecondcoming-500completions.txt" , ( "" , "shawwn" , "" , "" , "404 Not Found Error: no page by this name!" ) ) , ( "/docs/ai/poetry/2019-07-19-taotehching-ch1-1ksamples.txt" , ( "" , "shawwn" , "" , "" , "404 Not Found Error: no page by this name!" ) ) , ( "/docs/ai/poetry/2019-07-21-taotehching-all-1ksamples.txt" , ( "" , "shawwn" , "" , "" , "404 Not Found Error: no page by this name!" ) ) , ( "/docs/ai/poetry/2019-07-22-gpt2-345m-taotehching-all-ch181.tar.xz" , ( "" , "shawwn" , "" , "" , "404 Not Found Error: no page by this name!" ) ) , ( "/docs/ai/poetry/2019-12-15-gpt21.5b-poetry-samples-topp090.txt" , ( "" , "shawwn" , "" , "" , "404 Not Found Error: no page by this name!" ) ) , ( "/docs/ai/poetry/2019-12-18-gpt21.5b-poetry-samples-topp080.txt" , ( "" , "shawwn" , "" , "" , "404 Not Found Error: no page by this name!" ) ) , ( "/docs/borges/1937-borges-raymondllullsthinkingmachine.pdf" , ( "" , "" , "" , "" , "" ) ) , ( "http://antinegationism.tumblr.com/post/182901133106/an-eternal-howl" , ( "" , "" , "" , "" , "" ) ) , ( "http://codyraskin.com/research/?p=135" , ( "" , "" , "" , "" , "" ) ) , ( "http://creativecommons.org/about/cc0" , ( "" , "" , "" , "" , "" ) ) , ( "http://nlp.seas.harvard.edu/2018/04/03/attention.html" , ( "" , "" , "" , "" , "" ) ) , ( "http://papers.nips.cc/paper/6220-memory-efficient-backpropagation-through-time" , ( "" , "" , "" , "" , "" ) ) , ( "http://sevensecularsermons.org/on-the-significance-of-gwerns-poem-generator/" , ( "" , "" , "" , "" , "" ) ) , ( "http://sfbay-anarchists.org/wp-content/uploads/2012/05/Trurls-Electronic-Bard.pdf" , ( "" , "" , "" , "" , "" ) ) , ( "http://wiki.obormot.net" , ( "" , "" , "" , "" , "" ) ) , ( "http://www.aclweb.org/anthology/D15-1002" , ( "" , "" , "" , "" , "" ) ) , ( "http://www.peterbloem.nl/blog/transformers" , ( "" , "" , "" , "" , "" ) ) , ( "http://www.ubu.com/concept/racter.html" , ( "" , "" , "" , "" , "" ) ) , ( 
"http://yudhanjaya.com/2019/04/the-poetry-machine/" , ( "" , "" , "" , "" , "" ) ) , ( "https://ai.googleblog.com/2017/08/transformer-novel-neural-network.html" , ( "" , "" , "" , "" , "" ) ) , ( "https://arxiv.org/abs/1106.5730" , ( "HOGWILD!: A Lock-Free Approach to Parallelizing Stochastic Gradient\n Descent" , "Feng Niu, Benjamin Recht, Christopher Re, Stephen J. Wright" , "2020-01-25" , "" , "Stochastic Gradient Descent (SGD) is a popular algorithm that can achieve state-of-the-art performance on a variety of machine learning tasks. Several researchers have recently proposed schemes to parallelize SGD, but all require performance-destroying memory locking and synchronization. This work aims to show using novel theoretical analysis, algorithms, and implementation that SGD can be implemented without any locking. We present an update scheme called HOGWILD! which allows processors access to shared memory with the possibility of overwriting each other's work. We show that when the associated optimization problem is sparse, meaning most gradient updates only modify small parts of the decision variable, then HOGWILD! achieves a nearly optimal rate of convergence. We demonstrate experimentally that HOGWILD! outperforms alternative schemes that use locking by an order of magnitude." ) ) , ( "https://arxiv.org/abs/1506.01186" , ( "Cyclical Learning Rates for Training Neural Networks" , "Leslie N. Smith" , "2020-01-25" , "" , "It is known that the learning rate is the most important hyper-parameter to tune for training deep neural networks. This paper describes a new method for setting the learning rate, named cyclical learning rates, which practically eliminates the need to experimentally find the best values and schedule for the global learning rates. Instead of monotonically decreasing the learning rate, this method lets the learning rate cyclically vary between reasonable boundary values. 
Training with cyclical learning rates instead of fixed values achieves improved classification accuracy without a need to tune and often in fewer iterations. This paper also describes a simple way to estimate \"reasonable bounds\" -- linearly increasing the learning rate of the network for a few epochs. In addition, cyclical learning rates are demonstrated on the CIFAR-10 and CIFAR-100 datasets with ResNets, Stochastic Depth networks, and DenseNets, and the ImageNet dataset with the AlexNet and GoogLeNet architectures. These are practical tools for everyone who trains neural networks." ) ) , ( "https://arxiv.org/abs/1604.06174" , ( "Training Deep Nets with Sublinear Memory Cost" , "Tianqi Chen, Bing Xu, Chiyuan Zhang, Carlos Guestrin" , "2019-08-27" , "" , "We propose a systematic approach to reduce the memory consumption of deep neural network training. Specifically, we design an algorithm that costs O(sqrt(n)) memory to train a n layer network, with only the computational cost of an extra forward pass per mini-batch. As many of the state-of-the-art models hit the upper bound of the GPU memory, our algorithm allows deeper and more complex models to be explored, and helps advance the innovations in deep learning research. We focus on reducing the memory cost to store the intermediate feature maps and gradients during training. Computation graph analysis is used for automatic in-place operation and memory sharing optimizations. We show that it is possible to trade computation for memory - giving a more memory efficient training algorithm with a little extra computation cost. In the extreme case, our analysis also shows that the memory consumption can be reduced to O(log n) with as little as O(n log n) extra cost for forward computation. Our experiments show that we can reduce the memory cost of a 1,000-layer deep residual network from 48G to 7G with only 30 percent additional running time cost on ImageNet problems. 
Similarly, significant memory cost reduction is observed in training complex recurrent neural networks on very long sequences." ) ) , ( "https://arxiv.org/abs/1608.03983" , ( "SGDR: Stochastic Gradient Descent with Warm Restarts" , "Ilya Loshchilov, Frank Hutter" , "2020-01-25" , "" , "Restart techniques are common in gradient-free optimization to deal with multimodal functions. Partial warm restarts are also gaining popularity in gradient-based optimization to improve the rate of convergence in accelerated gradient schemes to deal with ill-conditioned functions. In this paper, we propose a simple warm restart technique for stochastic gradient descent to improve its anytime performance when training deep neural networks. We empirically study its performance on the CIFAR-10 and CIFAR-100 datasets, where we demonstrate new state-of-the-art results at 3.14% and 16.21%, respectively. We also demonstrate its advantages on a dataset of EEG recordings and on a downsampled version of the ImageNet dataset. Our source code is available at https://github.com/loshchil/SGDR" ) ) , ( "https://arxiv.org/abs/1610.01945" , ( "Connecting Generative Adversarial Networks and Actor-Critic Methods" , "David Pfau, Oriol Vinyals" , "2019-08-27" , "" , "Both generative adversarial networks (GAN) in unsupervised learning and actor-critic methods in reinforcement learning (RL) have gained a reputation for being difficult to optimize. Practitioners in both fields have amassed a large number of strategies to mitigate these instabilities and improve training. Here we show that GANs can be viewed as actor-critic methods in an environment where the actor cannot affect the reward. We review the strategies for stabilizing training for each class of models, both those that generalize between the two and those that are particular to that model. We also review a number of extensions to GANs and RL algorithms with even more complicated information flow. 
We hope that by highlighting this formal connection we will encourage both GAN and RL communities to develop general, scalable, and stable algorithms for multilevel optimization with deep networks, and to draw inspiration across communities." ) ) , ( "https://arxiv.org/abs/1611.03852" , ( "A Connection between Generative Adversarial Networks, Inverse\n Reinforcement Learning, and Energy-Based Models" , "Chelsea Finn, Paul Christiano, Pieter Abbeel, Sergey Levine" , "2019-08-27" , "" , "Generative adversarial networks (GANs) are a recently proposed class of generative models in which a generator is trained to optimize a cost function that is being simultaneously learned by a discriminator. While the idea of learning cost functions is relatively new to the field of generative modeling, learning costs has long been studied in control and reinforcement learning (RL) domains, typically for imitation learning from demonstrations. In these fields, learning cost function underlying observed behavior is known as inverse reinforcement learning (IRL) or inverse optimal control. While at first the connection between cost learning in RL and cost learning in generative modeling may appear to be a superficial one, we show in this paper that certain IRL methods are in fact mathematically equivalent to GANs. In particular, we demonstrate an equivalence between a sample-based algorithm for maximum entropy IRL and a GAN in which the generator's density can be evaluated and is provided as an additional input to the discriminator. Interestingly, maximum entropy IRL is a special case of an energy-based model. We discuss the interpretation of GANs as an algorithm for training energy-based models, and relate this interpretation to other recent work that seeks to connect GANs and EBMs. 
By formally highlighting the connection between GANs, IRL, and EBMs, we hope that researchers in all three communities can better identify and apply transferable ideas from one domain to another, particularly for developing more stable and scalable algorithms: a major challenge in all three domains." ) ) , ( "https://arxiv.org/abs/1706.03741" , ( "Deep reinforcement learning from human preferences" , "Paul Christiano, Jan Leike, Tom B. Brown, Miljan Martic, Shane Legg, Dario Amodei" , "2019-08-27" , "" , "For sophisticated reinforcement learning (RL) systems to interact usefully with real-world environments, we need to communicate complex goals to these systems. In this work, we explore goals defined in terms of (non-expert) human preferences between pairs of trajectory segments. We show that this approach can effectively solve complex RL tasks without access to the reward function, including Atari games and simulated robot locomotion, while providing feedback on less than one percent of our agent's interactions with the environment. This reduces the cost of human oversight far enough that it can be practically applied to state-of-the-art RL systems. To demonstrate the flexibility of our approach, we show that we can successfully train complex novel behaviors with about an hour of human time. These behaviors and environments are considerably more complex than any that have been previously learned from human feedback." ) ) , ( "https://arxiv.org/abs/1706.03762" , ( "Attention Is All You Need" , "Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser, Illia Polosukhin" , "2019-08-27" , "" , "The dominant sequence transduction models are based on complex recurrent or convolutional neural networks in an encoder-decoder configuration. The best performing models also connect the encoder and decoder through an attention mechanism. 
We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. Experiments on two machine translation tasks show these models to be superior in quality while being more parallelizable and requiring significantly less time to train. Our model achieves 28.4 BLEU on the WMT 2014 English-to-German translation task, improving over the existing best results, including ensembles by over 2 BLEU. On the WMT 2014 English-to-French translation task, our model establishes a new single-model state-of-the-art BLEU score of 41.8 after training for 3.5 days on eight GPUs, a small fraction of the training costs of the best models from the literature. We show that the Transformer generalizes well to other tasks by applying it successfully to English constituency parsing both with large and limited training data." ) ) , ( "https://arxiv.org/abs/1706.03799" , ( "Verb Physics: Relative Physical Knowledge of Actions and Objects" , "Maxwell Forbes, Yejin Choi" , "2019-08-27" , "" , "Learning commonsense knowledge from natural language text is nontrivial due to reporting bias: people rarely state the obvious, e.g., \"My house is bigger than me.\" However, while rarely stated explicitly, this trivial everyday knowledge does influence the way people talk about the world, which provides indirect clues to reason about the world. For example, a statement like, \"Tyler entered his house\" implies that his house is bigger than Tyler. In this paper, we present an approach to infer relative physical knowledge of actions and objects along five dimensions (e.g., size, weight, and strength) from unstructured natural language text. We frame knowledge acquisition as joint inference over two closely related problems: learning (1) relative physical knowledge of object pairs and (2) physical implications of actions when applied to those object pairs. 
Empirical results demonstrate that it is possible to extract knowledge of actions and objects from language and that joint inference over different types of knowledge improves performance." ) ) , ( "https://arxiv.org/abs/1706.07068" , ( "CAN: Creative Adversarial Networks, Generating \"Art\" by Learning About\n Styles and Deviating from Style Norms" , "Ahmed Elgammal, Bingchen Liu, Mohamed Elhoseiny, Marian Mazzone" , "2019-08-27" , "" , "We propose a new system for generating art. The system generates art by looking at art and learning about style; and becomes creative by increasing the arousal potential of the generated art by deviating from the learned styles. We build over Generative Adversarial Networks (GAN), which have shown the ability to learn to generate novel images simulating a given distribution. We argue that such networks are limited in their ability to generate creative products in their original design. We propose modifications to its objective to make it capable of generating creative art by maximizing deviation from established styles and minimizing deviation from art distribution. We conducted experiments to compare the response of human subjects to the generated art with their response to art created by artists. The results show that human subjects could not distinguish art generated by the proposed system from art generated by contemporary artists and shown in top art fairs. Human subjects even rated the generated images higher on various scales." ) ) , ( "https://arxiv.org/abs/1802.01241" , ( "Semantic projection: recovering human knowledge of multiple, distinct\n object features from word embeddings" , "Gabriel Grand, Idan Asher Blank, Francisco Pereira, Evelina Fedorenko" , "2019-08-27" , "" , "The words of a language reflect the structure of the human mind, allowing us to transmit thoughts between individuals. However, language can represent only a subset of our rich and detailed cognitive architecture. 
Here, we ask what kinds of common knowledge (semantic memory) are captured by word meanings (lexical semantics). We examine a prominent computational model that represents words as vectors in a multidimensional space, such that proximity between word-vectors approximates semantic relatedness. Because related words appear in similar contexts, such spaces - called \"word embeddings\" - can be learned from patterns of lexical co-occurrences in natural language. Despite their popularity, a fundamental concern about word embeddings is that they appear to be semantically \"rigid\": inter-word proximity captures only overall similarity, yet human judgments about object similarities are highly context-dependent and involve multiple, distinct semantic features. For example, dolphins and alligators appear similar in size, but differ in intelligence and aggressiveness. Could such context-dependent relationships be recovered from word embeddings? To address this issue, we introduce a powerful, domain-general solution: \"semantic projection\" of word-vectors onto lines that represent various object features, like size (the line extending from the word \"small\" to \"big\"), intelligence (from \"dumb\" to \"smart\"), or danger (from \"safe\" to \"dangerous\"). This method, which is intuitively analogous to placing objects \"on a mental scale\" between two extremes, recovers human judgments across a range of object categories and properties. We thus show that word embeddings inherit a wealth of common knowledge from word co-occurrence statistics and can be flexibly manipulated to express context-dependent meanings." ) ) , ( "https://arxiv.org/abs/1803.02155" , ( "Self-Attention with Relative Position Representations" , "Peter Shaw, Jakob Uszkoreit, Ashish Vaswani" , "2019-08-27" , "" , "Relying entirely on an attention mechanism, the Transformer introduced by Vaswani et al. (2017) achieves state-of-the-art results for machine translation. 
In contrast to recurrent and convolutional neural networks, it does not explicitly model relative or absolute position information in its structure. Instead, it requires adding representations of absolute positions to its inputs. In this work we present an alternative approach, extending the self-attention mechanism to efficiently consider representations of the relative positions, or distances between sequence elements. On the WMT 2014 English-to-German and English-to-French translation tasks, this approach yields improvements of 1.3 BLEU and 0.3 BLEU over absolute position representations, respectively. Notably, we observe that combining relative and absolute position representations yields no further improvement in translation quality. We describe an efficient implementation of our method and cast it as an instance of relation-aware self-attention mechanisms that can generalize to arbitrary graph-labeled inputs." ) ) , ( "https://arxiv.org/abs/1803.05407" , ( "Averaging Weights Leads to Wider Optima and Better Generalization" , "Pavel Izmailov, Dmitrii Podoprikhin, Timur Garipov, Dmitry Vetrov, Andrew Gordon Wilson" , "2020-01-25" , "" , "Deep neural networks are typically trained by optimizing a loss function with an SGD variant, in conjunction with a decaying learning rate, until convergence. We show that simple averaging of multiple points along the trajectory of SGD, with a cyclical or constant learning rate, leads to better generalization than conventional training. We also show that this Stochastic Weight Averaging (SWA) procedure finds much flatter solutions than SGD, and approximates the recent Fast Geometric Ensembling (FGE) approach with a single model. Using SWA we achieve notable improvement in test accuracy over conventional SGD training on a range of state-of-the-art residual networks, PyramidNets, DenseNets, and Shake-Shake networks on CIFAR-10, CIFAR-100, and ImageNet. 
In short, SWA is extremely easy to implement, improves generalization, and has almost no computational overhead." ) ) , ( "https://arxiv.org/abs/1804.04235" , ( "Adafactor: Adaptive Learning Rates with Sublinear Memory Cost" , "Noam Shazeer, Mitchell Stern" , "2020-01-25" , "" , "In several recently proposed stochastic optimization methods (e.g. RMSProp, Adam, Adadelta), parameter updates are scaled by the inverse square roots of exponential moving averages of squared past gradients. Maintaining these per-parameter second-moment estimators requires memory equal to the number of parameters. For the case of neural network weight matrices, we propose maintaining only the per-row and per-column sums of these moving averages, and estimating the per-parameter second moments based on these sums. We demonstrate empirically that this method produces similar results to the baseline. Secondly, we show that adaptive methods can produce larger-than-desired updates when the decay rate of the second moment accumulator is too slow. We propose update clipping and a gradually increasing decay rate scheme as remedies. Combining these methods and dropping momentum, we achieve comparable results to the published Adam regime in training the Transformer model on the WMT 2014 English-German machine translation task, while using very little auxiliary storage in the optimizer. Finally, we propose scaling the parameter updates based on the scale of the parameters themselves." ) ) , ( "https://arxiv.org/abs/1808.04444" , ( "Character-Level Language Modeling with Deeper Self-Attention" , "Rami Al-Rfou, Dokook Choe, Noah Constant, Mandy Guo, Llion Jones" , "2019-08-27" , "" , "LSTMs and other RNN variants have shown strong performance on character-level language modeling. These models are typically trained using truncated backpropagation through time, and it is common to assume that their success stems from their ability to remember long-term contexts. 
In this paper, we show that a deep (64-layer) transformer model with fixed context outperforms RNN variants by a large margin, achieving state of the art on two popular benchmarks: 1.13 bits per character on text8 and 1.06 on enwik8. To get good results at this depth, we show that it is important to add auxiliary losses, both at intermediate network layers and intermediate sequence positions." ) ) , ( "https://arxiv.org/abs/1901.02860" , ( "Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context" , "Zihang Dai, Zhilin Yang, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov" , "2019-08-27" , "" , "Transformers have a potential of learning longer-term dependency, but are limited by a fixed-length context in the setting of language modeling. We propose a novel neural architecture Transformer-XL that enables learning dependency beyond a fixed length without disrupting temporal coherence. It consists of a segment-level recurrence mechanism and a novel positional encoding scheme. Our method not only enables capturing longer-term dependency, but also resolves the context fragmentation problem. As a result, Transformer-XL learns dependency that is 80% longer than RNNs and 450% longer than vanilla Transformers, achieves better performance on both short and long sequences, and is up to 1,800+ times faster than vanilla Transformers during evaluation. Notably, we improve the state-of-the-art results of bpc/perplexity to 0.99 on enwiki8, 1.08 on text8, 18.3 on WikiText-103, 21.8 on One Billion Word, and 54.5 on Penn Treebank (without finetuning). When trained only on WikiText-103, Transformer-XL manages to generate reasonably coherent, novel text articles with thousands of tokens. Our code, pretrained models, and hyperparameters are available in both Tensorflow and PyTorch." 
) ) , ( "https://arxiv.org/abs/1904.09751" , ( "The Curious Case of Neural Text Degeneration" , "Ari Holtzman, Jan Buys, Maxwell Forbes, Yejin Choi" , "2019-08-27" , "" , "Despite considerable advancements with deep neural language models, the enigma of neural text degeneration persists when these models are tested as text generators. The counter-intuitive empirical observation is that even though the use of likelihood as training objective leads to high quality models for a broad range of language understanding tasks, using likelihood as a decoding objective leads to text that is bland and strangely repetitive. In this paper, we reveal surprising distributional differences between human text and machine text. In addition, we find that decoding strategies alone can dramatically effect the quality of machine text, even when generated from exactly the same neural language model. Our findings motivate Nucleus Sampling, a simple but effective method to draw the best out of neural generation. By sampling text from the dynamic nucleus of the probability distribution, which allows for diversity while effectively truncating the less reliable tail of the distribution, the resulting text better demonstrates the quality of human text, yielding enhanced diversity without sacrificing fluency and coherence." ) ) , ( "https://arxiv.org/abs/1904.10509" , ( "Generating Long Sequences with Sparse Transformers" , "Rewon Child, Scott Gray, Alec Radford, Ilya Sutskever" , "2019-08-27" , "" , "Transformers are powerful sequence models, but require time and memory that grows quadratically with the sequence length. In this paper we introduce sparse factorizations of the attention matrix which reduce this to $O(n \\sqrt{n})$. We also introduce a) a variation on architecture and initialization to train deeper networks, b) the recomputation of attention matrices to save memory, and c) fast attention kernels for training. 
We call networks with these changes Sparse Transformers, and show they can model sequences tens of thousands of timesteps long using hundreds of layers. We use the same architecture to model images, audio, and text from raw bytes, setting a new state of the art for density modeling of Enwik8, CIFAR-10, and ImageNet-64. We generate unconditional samples that demonstrate global coherence and great diversity, and show it is possible in principle to use self-attention to model sequences of length one million or more." ) ) , ( "https://arxiv.org/abs/1904.10509#openai" , ( "Generating Long Sequences with Sparse Transformers" , "Rewon Child, Scott Gray, Alec Radford, Ilya Sutskever" , "2020-01-25" , "" , "Transformers are powerful sequence models, but require time and memory that grows quadratically with the sequence length. In this paper we introduce sparse factorizations of the attention matrix which reduce this to $O(n \\sqrt{n})$. We also introduce a) a variation on architecture and initialization to train deeper networks, b) the recomputation of attention matrices to save memory, and c) fast attention kernels for training. We call networks with these changes Sparse Transformers, and show they can model sequences tens of thousands of timesteps long using hundreds of layers. We use the same architecture to model images, audio, and text from raw bytes, setting a new state of the art for density modeling of Enwik8, CIFAR-10, and ImageNet-64. We generate unconditional samples that demonstrate global coherence and great diversity, and show it is possible in principle to use self-attention to model sequences of length one million or more." 
) ) , ( "https://arxiv.org/abs/1905.02175" , ( "Adversarial Examples Are Not Bugs, They Are Features" , "Andrew Ilyas, Shibani Santurkar, Dimitris Tsipras, Logan Engstrom, Brandon Tran, Aleksander Madry" , "2019-08-27" , "" , "Adversarial examples have attracted significant attention in machine learning, but the reasons for their existence and pervasiveness remain unclear. We demonstrate that adversarial examples can be directly attributed to the presence of non-robust features: features derived from patterns in the data distribution that are highly predictive, yet brittle and incomprehensible to humans. After capturing these features within a theoretical framework, we establish their widespread existence in standard datasets. Finally, we present a simple setting where we can rigorously tie the phenomena we observe in practice to a misalignment between the (human-specified) notion of robustness and the inherent geometry of the data." ) ) , ( "https://arxiv.org/abs/1905.03197" , ( "Unified Language Model Pre-training for Natural Language Understanding\n and Generation" , "Li Dong, Nan Yang, Wenhui Wang, Furu Wei, Xiaodong Liu, Yu Wang, Jianfeng Gao, Ming Zhou, Hsiao-Wuen Hon" , "2019-08-27" , "" , "This paper presents a new Unified pre-trained Language Model (UniLM) that can be fine-tuned for both natural language understanding and generation tasks. The model is pre-trained using three types of language modeling objectives: unidirectional (both left-to-right and right-to-left), bidirectional, and sequence-to-sequence prediction. The unified modeling is achieved by employing a shared Transformer network and utilizing specific self-attention masks to control what context the prediction conditions on. We can fine-tune UniLM as a unidirectional decoder, a bidirectional encoder, or a sequence-to-sequence model to support various downstream natural language understanding and generation tasks. 
UniLM compares favorably with BERT on the GLUE benchmark, and the SQuAD 2.0 and CoQA question answering tasks. Moreover, our model achieves new state-of-the-art results on three natural language generation tasks, including improving the CNN/DailyMail abstractive summarization ROUGE-L to 40.63 (2.16 absolute improvement), pushing the CoQA generative question answering F1 score to 82.5 (37.1 absolute improvement), and the SQuAD question generation BLEU-4 to 22.88 (6.50 absolute improvement)." ) ) , ( "https://arxiv.org/abs/1905.12616" , ( "Defending Against Neural Fake News" , "Rowan Zellers, Ari Holtzman, Hannah Rashkin, Yonatan Bisk, Ali Farhadi, Franziska Roesner, Yejin Choi" , "2019-08-27" , "" , "Recent progress in natural language generation has raised dual-use concerns. While applications like summarization and translation are positive, the underlying technology also might enable adversaries to generate neural fake news: targeted propaganda that closely mimics the style of real news. Modern computer security relies on careful threat modeling: identifying potential threats and vulnerabilities from an adversary's point of view, and exploring potential mitigations to these threats. Likewise, developing robust defenses against neural fake news requires us first to carefully investigate and characterize the risks of these models. We thus present a model for controllable text generation called Grover. Given a headline like `Link Found Between Vaccines and Autism,' Grover can generate the rest of the article; humans find these generations to be more trustworthy than human-written disinformation. Developing robust verification techniques against generators like Grover is critical. We find that best current discriminators can classify neural fake news from real, human-written, news with 73% accuracy, assuming access to a moderate level of training data. 
Counterintuitively, the best defense against Grover turns out to be Grover itself, with 92% accuracy, demonstrating the importance of public release of strong generators. We investigate these results further, showing that exposure bias -- and sampling strategies that alleviate its effects -- both leave artifacts that similar discriminators can pick up on. We conclude by discussing ethical issues regarding the technology, and plan to release Grover publicly, helping pave the way for better detection of neural fake news." ) ) , ( "https://arxiv.org/abs/1906.08237" , ( "XLNet: Generalized Autoregressive Pretraining for Language Understanding" , "Zhilin Yang, Zihang Dai, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le" , "2019-08-27" , "" , "With the capability of modeling bidirectional contexts, denoising autoencoding based pretraining like BERT achieves better performance than pretraining approaches based on autoregressive language modeling. However, relying on corrupting the input with masks, BERT neglects dependency between the masked positions and suffers from a pretrain-finetune discrepancy. In light of these pros and cons, we propose XLNet, a generalized autoregressive pretraining method that (1) enables learning bidirectional contexts by maximizing the expected likelihood over all permutations of the factorization order and (2) overcomes the limitations of BERT thanks to its autoregressive formulation. Furthermore, XLNet integrates ideas from Transformer-XL, the state-of-the-art autoregressive model, into pretraining. Empirically, XLNet outperforms BERT on 20 tasks, often by a large margin, and achieves state-of-the-art results on 18 tasks including question answering, natural language inference, sentiment analysis, and document ranking." 
) ) , ( "https://arxiv.org/abs/1907.00151" , ( "GPT-based Generation for Classical Chinese Poetry" , "Yi Liao, Yasheng Wang, Qun Liu, Xin Jiang" , "2020-01-25" , "" , "We present a simple yet effective method for generating high quality classical Chinese poetry with Generative Pre-trained Language Model (GPT). The method adopts a simple GPT model, without using any human crafted rules or features, or designing any additional neural components. While the proposed model learns to generate various forms of classical Chinese poems, including Jueju, L\\\"{u}shi, various Cipai and Couples, the generated poems are of very high quality. We also propose and implement a method to fine-tune the model to generate acrostic poetry. To the best of our knowledge, this is the first to employ GPT in developing a poetry generation system. We have released an online mini demonstration program on Wechat to show the generation capability of the proposed method for classical Chinese poetry." ) ) , ( "https://arxiv.org/abs/1908.04319" , ( "Neural Text Generation with Unlikelihood Training" , "Sean Welleck, Ilia Kulikov, Stephen Roller, Emily Dinan, Kyunghyun Cho, Jason Weston" , "2019-08-27" , "" , "Neural text generation is a key tool in natural language applications, but it is well known there are major problems at its core. In particular, standard likelihood training and decoding leads to dull and repetitive responses. While some post-hoc fixes have been proposed, in particular top-k and nucleus sampling, they do not address the fact that the token-level probabilities predicted by the model itself are poor. In this paper we show that the likelihood objective itself is at fault, resulting in a model that assigns too much probability to sequences that contain repeats and frequent words unlike the human training distribution. We propose a new objective, unlikelihood training, which forces unlikely generations to be assigned lower probability by the model. 
We show that both token and sequence level unlikelihood training give less repetitive, less dull text while maintaining perplexity, giving far superior generations using standard greedy or beam search. Our approach provides a strong alternative to traditional training." ) ) , ( "https://arxiv.org/abs/1909.01380" , ( "The Bottom-up Evolution of Representations in the Transformer: A Study\n with Machine Translation and Language Modeling Objectives" , "Elena Voita, Rico Sennrich, Ivan Titov" , "2020-01-25" , "" , "We seek to understand how the representations of individual tokens and the structure of the learned feature space evolve between layers in deep neural networks under different learning objectives. We focus on the Transformers for our analysis as they have been shown effective on various tasks, including machine translation (MT), standard left-to-right language models (LM) and masked language modeling (MLM). Previous work used black-box probing tasks to show that the representations learned by the Transformer differ significantly depending on the objective. In this work, we use canonical correlation analysis and mutual information estimators to study how information flows across Transformer layers and how this process depends on the choice of learning objective. For example, as you go from bottom to top layers, information about the past in left-to-right language models gets vanished and predictions about the future get formed. In contrast, for MLM, representations initially acquire information about the context around the token, partially forgetting the token identity and producing a more generalized token representation. The token identity then gets recreated at the top MLM layers." ) ) , ( "https://arxiv.org/abs/1909.10705" , ( "Do Massively Pretrained Language Models Make Better Storytellers?" , "Abigail See, Aneesh Pappu, Rohun Saxena, Akhila Yerukola, Christopher D. 
Manning" , "2020-01-25" , "" , "Large neural language models trained on massive amounts of text have emerged as a formidable strategy for Natural Language Understanding tasks. However, the strength of these models as Natural Language Generators is less clear. Though anecdotal evidence suggests that these models generate better quality text, there has been no detailed study characterizing their generation abilities. In this work, we compare the performance of an extensively pretrained model, OpenAI GPT2-117 (Radford et al., 2019), to a state-of-the-art neural story generation model (Fan et al., 2018). By evaluating the generated text across a wide variety of automatic metrics, we characterize the ways in which pretrained models do, and do not, make better storytellers. We find that although GPT2-117 conditions more strongly on context, is more sensitive to ordering of events, and uses more unusual words, it is just as likely to produce repetitive and under-diverse text when using likelihood-maximizing decoding algorithms." ) ) , ( "https://arxiv.org/abs/1910.10683" , ( "Exploring the Limits of Transfer Learning with a Unified Text-to-Text\n Transformer" , "Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, Peter J. Liu" , "2020-01-25" , "" , "Transfer learning, where a model is first pre-trained on a data-rich task before being fine-tuned on a downstream task, has emerged as a powerful technique in natural language processing (NLP). The effectiveness of transfer learning has given rise to a diversity of approaches, methodology, and practice. In this paper, we explore the landscape of transfer learning techniques for NLP by introducing a unified framework that converts every language problem into a text-to-text format. Our systematic study compares pre-training objectives, architectures, unlabeled datasets, transfer approaches, and other factors on dozens of language understanding tasks. 
By combining the insights from our exploration with scale and our new \"Colossal Clean Crawled Corpus\", we achieve state-of-the-art results on many benchmarks covering summarization, question answering, text classification, and more. To facilitate future work on transfer learning for NLP, we release our dataset, pre-trained models, and code." ) ) , ( "https://arxiv.org/abs/1911.00536" , ( "DialoGPT: Large-Scale Generative Pre-training for Conversational\n Response Generation" , "Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan" , "2020-01-25" , "" , "We present a large, tunable neural conversational response generation model, DialoGPT (dialogue generative pre-trained transformer). Trained on 147M conversation-like exchanges extracted from Reddit comment chains over a period spanning from 2005 through 2017, DialoGPT extends the Hugging Face PyTorch transformer to attain a performance close to human both in terms of automatic and human evaluation in single-turn dialogue settings. We show that conversational systems that leverage DialoGPT generate more relevant, contentful and context-consistent responses than strong baseline systems. The pre-trained model and training pipeline are publicly released to facilitate research into neural response generation and the development of more intelligent open-domain dialogue systems." ) ) , ( "https://arxiv.org/abs/1911.05507" , ( "Compressive Transformers for Long-Range Sequence Modelling" , "Jack W. Rae, Anna Potapenko, Siddhant M. Jayakumar, Timothy P. Lillicrap" , "2020-01-25" , "" , "We present the Compressive Transformer, an attentive sequence model which compresses past memories for long-range sequence learning. We find the Compressive Transformer obtains state-of-the-art language modelling results in the WikiText-103 and Enwik8 benchmarks, achieving 17.1 ppl and 0.97 bpc respectively. 
We also find it can model high-frequency speech effectively and can be used as a memory mechanism for RL, demonstrated on an object matching task. To promote the domain of long-range sequence learning, we propose a new open-vocabulary language modelling benchmark derived from books, PG-19." ) ) , ( "https://arxiv.org/abs/1911.08265#deepmind" , ( "Mastering Atari, Go, Chess and Shogi by Planning with a Learned Model" , "Julian Schrittwieser, Ioannis Antonoglou, Thomas Hubert, Karen Simonyan, Laurent Sifre, Simon Schmitt, Arthur Guez, Edward Lockhart, Demis Hassabis, Thore Graepel, Timothy Lillicrap, David Silver" , "2020-01-25" , "" , "Constructing agents with planning capabilities has long been one of the main challenges in the pursuit of artificial intelligence. Tree-based planning methods have enjoyed huge success in challenging domains, such as chess and Go, where a perfect simulator is available. However, in real-world problems the dynamics governing the environment are often complex and unknown. In this work we present the MuZero algorithm which, by combining a tree-based search with a learned model, achieves superhuman performance in a range of challenging and visually complex domains, without any knowledge of their underlying dynamics. MuZero learns a model that, when applied iteratively, predicts the quantities most directly relevant to planning: the reward, the action-selection policy, and the value function. When evaluated on 57 different Atari games - the canonical video game environment for testing AI techniques, in which model-based planning approaches have historically struggled - our new algorithm achieved a new state of the art. When evaluated on Go, chess and shogi, without any knowledge of the game rules, MuZero matched the superhuman performance of the AlphaZero algorithm that was supplied with the game rules." 
) ) , ( "https://arxiv.org/abs/2001.04451#googlebrain" , ( "Reformer: The Efficient Transformer" , "Nikita Kitaev, \321ukasz Kaiser, Anselm Levskaya" , "2020-01-25" , "" , "Large Transformer models routinely achieve state-of-the-art results on a number of tasks but training these models can be prohibitively costly, especially on long sequences. We introduce two techniques to improve the efficiency of Transformers. For one, we replace dot-product attention by one that uses locality-sensitive hashing, changing its complexity from O($L^2$) to O($L\\log L$), where $L$ is the length of the sequence. Furthermore, we use reversible residual layers instead of the standard residuals, which allows storing activations only once in the training process instead of $N$ times, where $N$ is the number of layers. The resulting model, the Reformer, performs on par with Transformer models while being much more memory-efficient and much faster on long sequences." ) ) , ( "https://arxiv.org/pdf/1706.03741.pdf#page=15" , ( "" , "" , "" , "" , "" ) ) , ( "https://arxiv.org/pdf/1809.11096.pdf#page=6" , ( "" , "" , "" , "" , "" ) ) , ( "https://ask-gpt.tumblr.com/" , ( "" , "" , "" , "" , "" ) ) , ( "https://ask-gpt.tumblr.com/post/183402346117/april-is-the-cruelest-month-breeding-n-lilacs" , ( "" , "" , "" , "" , "" ) ) , ( "https://blog.acolyer.org/2018/02/22/dynamic-word-embeddings-for-evolving-semantic-discovery/" , ( "" , "" , "" , "" , "" ) ) , ( "https://blog.floydhub.com/the-transformer-in-pytorch/" , ( "" , "" , "" , "" , "" ) ) , ( "https://boingboing.net/2019/03/15/digital-lit.html" , ( "" , "" , "" , "" , "" ) ) , ( "https://cloud.google.com/tpu/pricing" , ( "" , "" , "" , "" , "" ) ) , ( "https://colab.research.google.com/drive/1BXry0kcm869-RVHHiY6NZmY9uBzbkf1Q" , ( "" , "" , "" , "" , "" ) ) , ( "https://d4mucfpksywv.cloudfront.net/better-language-models/language_models_are_unsupervised_multitask_learners.pdf#openai" , ( "" , "" , "" , "" , "" ) ) , ( 
"https://d4mucfpksywv.cloudfront.net/better-language-models/language_models_are_unsupervised_multitask_learners.pdf#page=4" , ( "" , "" , "" , "" , "" ) ) , ( "https://d4mucfpksywv.cloudfront.net/papers/GPT_2_Report.pdf#openai" , ( "" , "" , "" , "" , "" ) ) , ( "https://dantkz.github.io/How-To-Debug-A-Memory-Leak-In-TensorFlow/" , ( "" , "" , "" , "" , "" ) ) , ( "https://decaut.org/situ/index.php/ttc-compilation/" , ( "" , "" , "" , "" , "" ) ) , ( "https://deepmind.com/blog/alphastar-mastering-real-time-strategy-game-starcraft-ii/" , ( "" , "" , "" , "" , "" ) ) , ( "https://distill.pub/2017/momentum/" , ( "" , "" , "" , "" , "" ) ) , ( "https://einstein.ai/presentations/ctrl.pdf" , ( "" , "" , "" , "" , "" ) ) , ( "https://en.wikipedia.org/wiki/Amazon_S3" , ( "Amazon S3" , "English Wikipedia" , "" , "" , "

Amazon S3 or Amazon Simple Storage Service is a service offered by Amazon Web Services (AWS) that provides object storage through a web service interface. Amazon S3 uses the same scalable storage infrastructure that Amazon.com uses to run its global e-commerce network.

" ) ) , ( "https://en.wikipedia.org/wiki/Bouba-Kiki_effect" , ( "Bouba/kiki effect" , "English Wikipedia" , "" , "" , "

The bouba/kiki effect is a non-arbitrary mapping between speech sounds and the visual shape of objects. This effect was first observed by German-American psychologist Wolfgang K\246hler in 1929. In psychological experiments first conducted on the island of Tenerife, K\246hler showed forms similar to those shown at the right and asked participants which shape was called \"takete\" and which was called \"baluba\". Although not explicitly stated, K\246hler implies that there was a strong preference to pair the jagged shape with \"takete\" and the rounded shape with \"baluba\".

" ) ) , ( "https://en.wikipedia.org/wiki/Bradley-Terry_model" , ( "Bradley\8211Terry model" , "English Wikipedia" , "" , "" , "

The Bradley\8211Terry model is a probability model that can predict the outcome of a paired comparison. Given a pair of individuals i and j drawn from some population, it estimates the probability that the pairwise comparison i > j turns out true, as

\n
" ) ) , ( "https://en.wikipedia.org/wiki/CloudFlare" , ( "Cloudflare" , "English Wikipedia" , "" , "" , "

Cloudflare, Inc. is an American web infrastructure and website security company, providing content delivery network services, DDoS mitigation, Internet security, and distributed domain name server services. Cloudflare's services sit between a website's visitor and the Cloudflare user's hosting provider, acting as a reverse proxy for websites. Cloudflare's headquarters are in San Francisco, California, with additional offices in Lisbon, London, Singapore, Munich, San Jose, Champaign, Illinois, Austin, New York City and Washington, D.C.

" ) ) , ( "https://en.wikipedia.org/wiki/Content_delivery_network" , ( "Content delivery network" , "English Wikipedia" , "" , "" , "

A content delivery network or content distribution network (CDN) is a geographically distributed network of proxy servers and their data centers. The goal is to provide high availability and high performance by distributing the service spatially relative to end-users. CDNs serve a large portion of the Internet content today, including web objects, downloadable objects, applications, live streaming media, on-demand streaming media, and social media sites.

" ) ) , ( "https://en.wikipedia.org/wiki/Creative_Commons" , ( "Creative Commons" , "English Wikipedia" , "" , "" , "

" ) ) , ( "https://en.wikipedia.org/wiki/Dune_Messiah" , ( "Dune Messiah" , "English Wikipedia" , "" , "" , "

Dune Messiah is a science fiction novel by American writer Frank Herbert, the second in his Dune series of six novels. It was originally serialized in Galaxy magazine in 1969. The American and British editions have different prologues summarizing events in the previous novel. Dune Messiah and its sequel Children of Dune were collectively adapted by the Sci-Fi Channel in 2003 into a miniseries entitled Frank Herbert's Children of Dune. In 2002, the Science Fiction Book Club also published the two novels in one volume.

" ) ) , ( "https://en.wikipedia.org/wiki/Elegy_Written_in_a_Country_Churchyard" , ( "Elegy Written in a Country Churchyard" , "English Wikipedia" , "" , "" , "

Elegy Written in a Country Churchyard is a poem by Thomas Gray, completed in 1750 and first published in 1751. The poem's origins are unknown, but it was partly inspired by Gray's thoughts following the death of the poet Richard West in 1742. Originally titled Stanzas Wrote in a Country Church-Yard, the poem was completed when Gray was living near St Giles' parish church at Stoke Poges. It was sent to his friend Horace Walpole, who popularised the poem among London literary circles. Gray was eventually forced to publish the work on 15 February 1751 in order to preempt a magazine publisher from printing an unlicensed copy of the poem.

" ) ) , ( "https://en.wikipedia.org/wiki/Essay_On_Criticism" , ( "An Essay on Criticism" , "English Wikipedia" , "" , "" , "

An Essay on Criticism is one of the first major poems written by the English writer Alexander Pope (1688\8211\&1744). It is the source of the famous quotations \"To err is human, to forgive divine,\" \"A little learning is a dang'rous thing\", and \"Fools rush in where angels fear to tread.\" It first appeared in 1711 after having been written in 1709, and it is clear from Pope's correspondence that many of the poem's ideas had existed in prose form since at least 1706. Composed in heroic couplets and written in the Horatian mode of satire, it is a verse essay primarily concerned with how writers and critics behave in the new literary commerce of Pope's contemporary age. The poem covers a range of good criticism and advice, and represents many of the chief literary ideals of Pope's age.

" ) ) , ( "https://en.wikipedia.org/wiki/Frank_Herbert" , ( "Frank Herbert" , "English Wikipedia" , "" , "" , "

Franklin Patrick Herbert Jr. was an American science-fiction author best known for the 1965 novel Dune and its five sequels. Though he became famous for his novels, he also wrote short stories and worked as a newspaper journalist, photographer, book reviewer, ecological consultant, and lecturer.

" ) ) , ( "https://en.wikipedia.org/wiki/Heartbeat_%28computing%29" , ( "Heartbeat (computing)" , "English Wikipedia" , "" , "" , "

In computer science, a heartbeat is a periodic signal generated by hardware or software to indicate normal operation or to synchronize other parts of a computer system. Usually a heartbeat is sent between machines at a regular interval in the order of seconds. If the endpoint does not receive a heartbeat for a time\8212usually a few heartbeat intervals\8212the machine that should have sent the heartbeat is assumed to have failed.

" ) ) , ( "https://en.wikipedia.org/wiki/Invictus" , ( "Invictus" , "English Wikipedia" , "" , "" , "

\"Invictus\" is a short Victorian poem by the English poet William Ernest Henley (1849\8211\&1903). It was written in 1875 and published in 1888 in his first volume of poems, Book of Verses, in the section Life and Death (Echoes). It shows how Henley never lost hope and kept faith in himself and faced the struggles unafraid.

" ) ) , ( "https://en.wikipedia.org/wiki/Jabberwocky" , ( "Jabberwocky" , "English Wikipedia" , "" , "" , "

\"Jabberwocky\" is a nonsense poem written by Lewis Carroll about the killing of a creature named \"the Jabberwock\". It was included in his 1871 novel Through the Looking-Glass, and What Alice Found There, the sequel to Alice's Adventures in Wonderland. The book tells of Alice's adventures within the back-to-front world of Looking-Glass Land.

" ) ) , ( "https://en.wikipedia.org/wiki/John_Keats" , ( "John Keats" , "English Wikipedia" , "" , "" , "

John Keats was an English Romantic poet. He was one of the main figures of the second generation of Romantic poets, along with Lord Byron and Percy Bysshe Shelley, despite his works having been in publication for only four years before his death from tuberculosis at the age of 25.

" ) ) , ( "https://en.wikipedia.org/wiki/Kalevala" , ( "Kalevala" , "English Wikipedia" , "" , "" , "

The Kalevala is a 19th-century work of epic poetry compiled by Elias L\246nnrot from Karelian and Finnish oral folklore and mythology.

" ) ) , ( "https://en.wikipedia.org/wiki/Moravec%27s_paradox" , ( "Moravec's paradox" , "English Wikipedia" , "" , "" , "

Moravec's paradox is the observation by artificial intelligence and robotics researchers that, contrary to traditional assumptions, reasoning requires very little computation, but sensorimotor skills require enormous computational resources. The principle was articulated by Hans Moravec, Rodney Brooks, Marvin Minsky and others in the 1980s. As Moravec writes, \"it is comparatively easy to make computers exhibit adult level performance on intelligence tests or playing checkers, and difficult or impossible to give them the skills of a one-year-old when it comes to perception and mobility\".

" ) ) , ( "https://en.wikipedia.org/wiki/Oda_Nobunaga" , ( "Oda Nobunaga" , "English Wikipedia" , "" , "" , "

Oda Nobunaga was a Japanese daimy\333 in the late 16th century who attempted to unify Japan during the late Sengoku period, and successfully gained control over most of Honshu through conquest. Nobunaga is regarded as one of three unifiers of Japan along with his retainers Toyotomi Hideyoshi and Tokugawa Ieyasu. During his later life, Nobunaga was widely known for most brutal suppression of determined opponents, eliminating those who by principle refused to cooperate or yield to his demands. His reign was noted for innovative military tactics, fostering free trade, and encouraging the start of the Momoyama historical art period. He was killed when his retainer Akechi Mitsuhide rebelled against him at Honn\333-ji.

" ) ) , ( "https://en.wikipedia.org/wiki/Ozymandias" , ( "Ozymandias" , "English Wikipedia" , "" , "" , "

\"Ozymandias\" is the title of two related sonnets published in 1818.

" ) ) , ( "https://en.wikipedia.org/wiki/Pioneers%21_O_Pioneers%21" , ( "Pioneers! O Pioneers!" , "English Wikipedia" , "" , "" , "

\"Pioneers! O Pioneers!\" is a poem by the American poet Walt Whitman. It was first published in Leaves of Grass in 1865. The poem was written as a tribute to Whitman's fervor for the great Westward expansion in the United States that led to things like the California Gold Rush and exploration of the far west.

" ) ) , ( "https://en.wikipedia.org/wiki/Poetry_Foundation" , ( "Poetry Foundation" , "English Wikipedia" , "" , "" , "

The Poetry Foundation is a Chicago-based American foundation created to promote poetry in the wider culture. It was formed from Poetry magazine, which it continues to publish, with a 2003 gift of \$200 million from philanthropist Ruth Lilly.

" ) ) , ( "https://en.wikipedia.org/wiki/Portable_Game_Notation" , ( "Portable Game Notation" , "English Wikipedia" , "" , "" , "

Portable Game Notation (PGN) is a plain text computer-processible format for recording chess games, supported by many chess programs.

" ) ) , ( "https://en.wikipedia.org/wiki/RACTER" , ( "Racter" , "English Wikipedia" , "" , "" , "

Racter is an artificial intelligence computer program that generates English language prose at random.

" ) ) , ( "https://en.wikipedia.org/wiki/Sonnet_29" , ( "Sonnet 29" , "English Wikipedia" , "" , "" , "

Sonnet 29 is one of 154 sonnets written by the English playwright and poet William Shakespeare. It is part of the Fair Youth sequence. In the sonnet, the speaker bemoans his status as an outcast and failure but feels better upon thinking of his beloved. Sonnet 29 is written in the typical Shakespearean sonnet form, having 14 lines of iambic pentameter ending in a rhymed couplet.

" ) ) , ( "https://en.wikipedia.org/wiki/Tao_Te_Ching" , ( "Tao Te Ching" , "English Wikipedia" , "" , "" , "

The Tao Te Ching, Chinese: \36947\24503\32463; pinyin: Dao De Jing), also known as Lao Tzu or Laozi, is a Chinese classic text traditionally credited to the 6th-century BC sage Laozi. The text's authorship, date of composition and date of compilation are debated. The oldest excavated portion dates back to the late 4th century BC, but modern scholarship dates other parts of the text as having been written\8212or at least compiled\8212later than the earliest portions of the Zhuangzi.

" ) ) , ( "https://en.wikipedia.org/wiki/The_Cyberiad" , ( "The Cyberiad" , "English Wikipedia" , "" , "" , "

The Cyberiad is a series of humorous science fiction short stories by Polish writer Stanis\322aw Lem, originally published in 1965, with an English translation appearing in 1974. The main protagonists of the series are Trurl and Klapaucius, the \"constructors\".

" ) ) , ( "https://en.wikipedia.org/wiki/The_Fall_of_Hyperion%3A_A_Dream" , ( "The Fall of Hyperion: A Dream" , "English Wikipedia" , "" , "" , "

The Fall of Hyperion: A Dream, sometimes subtitled as A Vision instead of a dream, is an epic poem written by the English Romantic John Keats. Keats composed The Fall of Hyperion by reworking, expanding, and personally narrating lines from his earlier fragmented epic poem Hyperion.

" ) ) , ( "https://en.wikipedia.org/wiki/The_Rime_of_the_Ancient_Mariner" , ( "The Rime of the Ancient Mariner" , "English Wikipedia" , "" , "" , "

The Rime of the Ancient Mariner is the longest major poem by the English poet Samuel Taylor Coleridge, written in 1797\8211\&98 and published in 1798 in the first edition of Lyrical Ballads. Some modern editions use a revised version printed in 1817 that featured a gloss. Along with other poems in Lyrical Ballads, it is often considered a signal shift to modern poetry and the beginning of British Romantic literature.

" ) ) , ( "https://en.wikipedia.org/wiki/The_Song_of_Hiawatha" , ( "The Song of Hiawatha" , "English Wikipedia" , "" , "" , "

The Song of Hiawatha is an 1855 epic poem in trochaic tetrameter by Henry Wadsworth Longfellow which features Native American characters. The epic relates the fictional adventures of an Ojibwe warrior named Hiawatha and the tragedy of his love for Minnehaha, a Dakota woman. Events in the story are set in the Pictured Rocks area on the south shore of Lake Superior. Longfellow's poem is based on oral traditions surrounding the figure of Manabozho, but it also contains his own innovations.

" ) ) , ( "https://en.wikipedia.org/wiki/Thomas_Gray" , ( "Thomas Gray" , "English Wikipedia" , "" , "" , "

Thomas Gray was an English poet, letter-writer, classical scholar, and professor at Pembroke College, Cambridge. He is widely known for his Elegy Written in a Country Churchyard, published in 1751.

" ) ) , ( "https://en.wikipedia.org/wiki/Watchdog_timer" , ( "Watchdog timer" , "English Wikipedia" , "" , "" , "

A watchdog timer is an electronic timer that is used to detect and recover from computer malfunctions. During normal operation, the computer regularly resets the watchdog timer to prevent it from elapsing, or \"timing out\". If, due to a hardware fault or program error, the computer fails to reset the watchdog, the timer will elapse and generate a timeout signal. The timeout signal is used to initiate corrective action or actions. The corrective actions typically include placing the computer system in a safe state and restoring normal system operation.

" ) ) , ( "https://en.wikipedia.org/wiki/beam_search" , ( "Beam search" , "English Wikipedia" , "" , "" , "

In computer science, beam search is a heuristic search algorithm that explores a graph by expanding the most promising node in a limited set. Beam search is an optimization of best-first search that reduces its memory requirements. Best-first search is a graph search which orders all partial solutions (states) according to some heuristic. But in beam search, only a predetermined number of best partial solutions are kept as candidates. It is thus a greedy algorithm.

" ) ) , ( "https://en.wikipedia.org/wiki/bfloat16_floating-point_format" , ( "Bfloat16 floating-point format" , "English Wikipedia" , "" , "" , "

The bfloat16 floating-point format is a computer number format occupying 16 bits in computer memory; it represents a wide dynamic range of numeric values by using a floating radix point. This format is a truncated (16-bit) version of the 32-bit IEEE 754 single-precision floating-point format (binary32) with the intent of accelerating machine learning and near-sensor computing. It preserves the approximate dynamic range of 32-bit floating-point numbers by retaining 8 exponent bits, but supports only an 8-bit precision rather than the 24-bit significand of the binary32 format. More so than single-precision 32-bit floating-point numbers, bfloat16 numbers are unsuitable for integer calculations, but this is not their intended use.

" ) ) , ( "https://en.wikipedia.org/wiki/bias%E2%80%93variance_tradeoff" , ( "" , "" , "" , "" , "" ) ) , ( "https://en.wikipedia.org/wiki/liability-threshold_model" , ( "Threshold model" , "English Wikipedia" , "" , "" , "

In mathematical or statistical modeling a threshold model is any model where a threshold value, or set of threshold values, is used to distinguish ranges of values where the behaviour predicted by the model varies in some important way. A particularly important instance arises in toxicology, where the model for the effect of a drug may be that there is zero effect for a dose below a critical or threshold value, while an effect of some significance exists above that value. Certain types of regression model may include threshold effects.

" ) ) , ( "https://en.wikipedia.org/wiki/mojibake" , ( "Mojibake" , "English Wikipedia" , "" , "" , "

Mojibake is the garbled text that is the result of text being decoded using an unintended character encoding. The result is a systematic replacement of symbols with completely unrelated ones, often from a different writing system.

" ) ) , ( "https://en.wikipedia.org/wiki/stylometrics" , ( "Stylometry" , "English Wikipedia" , "" , "" , "

Stylometry is the application of the study of linguistic style, usually to written language, but it has successfully been applied to music and to fine-art paintings as well. Another conceptualization defines it as the linguistic discipline that applies statistical analysis to literature, evaluating the author's style through various quantitative criteria.