presentation_1.bib

%% LaTeX2e file `presentation_1.bib'
%% generated by the `filecontents' environment
%% from source `presentation_1' on 2020/04/15.
%%
    % arXiv preprint by Howard and Ruder on language-model fine-tuning for text classification.
    @misc{howard2018universal,
        author        = {Howard, Jeremy and Ruder, Sebastian},
        title         = {Universal Language Model Fine-tuning for Text Classification},
        year          = {2018},
        eprint        = {1801.06146},
        archivePrefix = {arXiv},
        primaryClass  = {cs.CL}
    }
% arXiv preprint. The given name of Kaiser uses the special character {\L}
% (L with stroke) so classic BibTeX typesets and sorts it correctly.
@misc{vaswani2017attention,
    author        = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N. and Kaiser, {\L}ukasz and Polosukhin, Illia},
    title         = {Attention Is All You Need},
    year          = {2017},
    eprint        = {1706.03762},
    archivePrefix = {arXiv},
    primaryClass  = {cs.CL}
}
% arXiv preprint. The acronym is braced so sentence-casing styles do not
% lowercase it.
@misc{sun2019finetune,
    author        = {Sun, Chi and Qiu, Xipeng and Xu, Yige and Huang, Xuanjing},
    title         = {How to Fine-Tune {BERT} for Text Classification?},
    year          = {2019},
    eprint        = {1905.05583},
    archivePrefix = {arXiv},
    primaryClass  = {cs.CL}
}
% No proceedings title is recorded for this work, so it is typed as a technical
% report rather than a conference paper (which would require a booktitle).
% NOTE(review): institution is taken to be the authors' affiliation -- confirm.
@techreport{Gray2017GPUKF,
  author      = {Gray, Scott and Radford, Alec and Kingma, Diederik P.},
  title       = {{GPU} Kernels for Block-Sparse Weights},
  institution = {OpenAI},
  year        = {2017}
}
% arXiv preprint. The model name is braced so its mixed capitalisation survives
% sentence-casing styles.
@misc{yang2019xlnet,
    author        = {Yang, Zhilin and Dai, Zihang and Yang, Yiming and Carbonell, Jaime and Salakhutdinov, Ruslan and Le, Quoc V.},
    title         = {{XLNet}: Generalized Autoregressive Pretraining for Language Understanding},
    year          = {2019},
    eprint        = {1906.08237},
    archivePrefix = {arXiv},
    primaryClass  = {cs.CL}
}

% arXiv preprint. The acronym is braced so sentence-casing styles do not
% lowercase it.
@misc{devlin2018bert,
    author        = {Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina},
    title         = {{BERT}: Pre-training of Deep Bidirectional Transformers for Language Understanding},
    year          = {2018},
    eprint        = {1810.04805},
    archivePrefix = {arXiv},
    primaryClass  = {cs.CL}
}

% No journal is recorded for this work, so it is typed as a technical report
% rather than an article (which would require a journal field).
% NOTE(review): institution is taken to be the authors' affiliation -- confirm.
@techreport{radford2019language,
  author      = {Radford, Alec and Wu, Jeff and Child, Rewon and Luan, David and Amodei, Dario and Sutskever, Ilya},
  title       = {Language Models are Unsupervised Multitask Learners},
  institution = {OpenAI},
  year        = {2019}
}

% arXiv preprint. The system name is braced as a whole word so its
% capitalisation survives sentence-casing styles.
@misc{shoeybi2019megatronlm,
    author        = {Shoeybi, Mohammad and Patwary, Mostofa and Puri, Raul and LeGresley, Patrick and Casper, Jared and Catanzaro, Bryan},
    title         = {{Megatron-LM}: Training Multi-Billion Parameter Language Models Using Model Parallelism},
    year          = {2019},
    eprint        = {1909.08053},
    archivePrefix = {arXiv},
    primaryClass  = {cs.CL}
}

% Journal article. Title and journal are stored in Title Case so that styles
% may downcase them as needed. The trailing fields are BibSonomy export
% metadata; standard styles ignore them.
@article{hochreiter1997long,
  author    = {Hochreiter, Sepp and Schmidhuber, J{\"u}rgen},
  title     = {Long Short-Term Memory},
  journal   = {Neural Computation},
  volume    = {9},
  number    = {8},
  pages     = {1735--1780},
  year      = {1997},
  publisher = {MIT Press},
  doi       = {10.1162/neco.1997.9.8.1735},
  keywords  = {lstm rnn},
  biburl    = {https://www.bibsonomy.org/bibtex/2a4a80026d24955b267cae636aa8abe4a/dallmann},
  interhash = {0692b471c4b9ae65d00affebc09fb467},
  intrahash = {a4a80026d24955b267cae636aa8abe4a},
  added-at  = {2016-11-15T08:49:43.000+0100},
  timestamp = {2016-11-15T08:49:43.000+0100}
}

% arXiv preprint. The benchmark acronym is braced so sentence-casing styles do
% not lowercase it.
@misc{wang2018glue,
    author        = {Wang, Alex and Singh, Amanpreet and Michael, Julian and Hill, Felix and Levy, Omer and Bowman, Samuel R.},
    title         = {{GLUE}: A Multi-Task Benchmark and Analysis Platform for Natural Language Understanding},
    year          = {2018},
    eprint        = {1804.07461},
    archivePrefix = {arXiv},
    primaryClass  = {cs.CL}
}

% arXiv preprint by Peters et al. on contextualized word representations.
@misc{peters2018deep,
    author        = {Peters, Matthew E. and Neumann, Mark and Iyyer, Mohit and Gardner, Matt and Clark, Christopher and Lee, Kenton and Zettlemoyer, Luke},
    title         = {Deep contextualized word representations},
    year          = {2018},
    eprint        = {1802.05365},
    archivePrefix = {arXiv},
    primaryClass  = {cs.CL}
}

% Conference paper. The language names are braced so sentence-casing styles
% keep them capitalised.
@inproceedings{wordPiece,
    author    = {Schuster, Mike and Nakajima, Kaisuke},
    title     = {{Japanese} and {Korean} Voice Search},
    booktitle = {International Conference on Acoustics, Speech and Signal Processing},
    pages     = {5149--5152},
    year      = {2012}
}

% Chapter in an edited collection (Advances in Neural Information Processing Systems 26).
@incollection{mikolov2013,
    author    = {Mikolov, Tomas and Sutskever, Ilya and Chen, Kai and Corrado, Greg S and Dean, Jeff},
    title     = {Distributed Representations of Words and Phrases and their Compositionality},
    editor    = {Burges, C. J. C. and Bottou, L. and Welling, M. and Ghahramani, Z. and Weinberger, K. Q.},
    booktitle = {Advances in Neural Information Processing Systems 26},
    publisher = {Curran Associates, Inc.},
    year      = {2013},
    pages     = {3111--3119},
    url       = {http://papers.nips.cc/paper/5021-distributed-representations-of-words-and-phrases-and-their-compositionality.pdf}
}

% Conference paper. The model name is braced as a whole word so both capitals
% (G and V) are preserved under sentence-casing styles.
@inproceedings{penningtonglove,
    author    = {Pennington, Jeffrey and Socher, Richard and Manning, Christopher},
    title     = {{GloVe}: Global Vectors for Word Representation},
    booktitle = {Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing ({EMNLP})},
    month     = oct,
    year      = {2014},
    address   = {Doha, Qatar},
    publisher = {Association for Computational Linguistics},
    pages     = {1532--1543},
    doi       = {10.3115/v1/D14-1162},
    url       = {https://www.aclweb.org/anthology/D14-1162},
}

% No proceedings title is recorded for this work, so it is typed as a technical
% report rather than a conference paper (which would require a booktitle).
% NOTE(review): co-author list and institution supplied from the published
% report -- confirm against the cited copy.
@techreport{Radford2018ImprovingLU,
  author      = {Radford, Alec and Narasimhan, Karthik and Salimans, Tim and Sutskever, Ilya},
  title       = {Improving Language Understanding by Generative Pre-Training},
  institution = {OpenAI},
  year        = {2018}
}

% Conference paper from the LREC 2018 proceedings.
@inproceedings{mikolov2018advances,
  author    = {Mikolov, Tomas and Grave, Edouard and Bojanowski, Piotr and Puhrsch, Christian and Joulin, Armand},
  title     = {Advances in Pre-Training Distributed Word Representations},
  booktitle = {Proceedings of the International Conference on Language Resources and Evaluation (LREC 2018)},
  year      = {2018}
}