-
Notifications
You must be signed in to change notification settings - Fork 4
/
presentation_1.bib
147 lines (136 loc) · 5.27 KB
/
presentation_1.bib
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
%% LaTeX2e file `presentation_1.bib'
%% generated by the `filecontents' environment
%% from source `presentation_1' on 2020/04/15.
%%
% arXiv preprint 1801.06146 (cs.CL) by Howard and Ruder, 2018.
@misc{howard2018universal,
  author        = {Howard, Jeremy and Ruder, Sebastian},
  title         = {Universal Language Model Fine-tuning for Text Classification},
  year          = {2018},
  eprint        = {1801.06146},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}
% arXiv preprint 1706.03762 (cs.CL), 2017.
% The seventh author's given name is written with a stroked L; it is entered
% as the BibTeX special character {\L} so it sorts and prints correctly.
@misc{vaswani2017attention,
  author        = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and
                   Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N. and
                   Kaiser, {\L}ukasz and Polosukhin, Illia},
  title         = {Attention Is All You Need},
  year          = {2017},
  eprint        = {1706.03762},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}
% arXiv preprint 1905.05583 (cs.CL), 2019.
% {BERT} is braced so styles that sentence-case titles keep the acronym in capitals.
@misc{sun2019finetune,
  author        = {Sun, Chi and Qiu, Xipeng and Xu, Yige and Huang, Xuanjing},
  title         = {How to Fine-Tune {BERT} for Text Classification?},
  year          = {2019},
  eprint        = {1905.05583},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}
% Entered as a technical report: the previous inproceedings type had no
% booktitle (a required field), so BibTeX warned and printed an incomplete
% reference. The citation key is unchanged.
% {GPU} is braced so sentence-casing styles keep the acronym in capitals.
@techreport{Gray2017GPUKF,
  author      = {Gray, Scott and Radford, Alec and Kingma, Diederik P.},
  title       = {{GPU} Kernels for Block-Sparse Weights},
  institution = {OpenAI},
  year        = {2017},
}
% arXiv preprint 1906.08237 (cs.CL), 2019.
% {XLNet} is braced so sentence-casing styles do not turn it into "Xlnet".
@misc{yang2019xlnet,
  author        = {Yang, Zhilin and Dai, Zihang and Yang, Yiming and
                   Carbonell, Jaime and Salakhutdinov, Ruslan and Le, Quoc V.},
  title         = {{XLNet}: Generalized Autoregressive Pretraining for Language Understanding},
  year          = {2019},
  eprint        = {1906.08237},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}
% arXiv preprint 1810.04805 (cs.CL), 2018.
% {BERT} is braced so sentence-casing styles do not turn it into "Bert".
@misc{devlin2018bert,
  author        = {Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina},
  title         = {{BERT}: Pre-training of Deep Bidirectional Transformers for Language Understanding},
  year          = {2018},
  eprint        = {1810.04805},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}
% Entered as a technical report: the previous article type had no journal
% (a required field), so BibTeX warned and printed an incomplete reference.
% The citation key is unchanged.
@techreport{radford2019language,
  author      = {Radford, Alec and Wu, Jeff and Child, Rewon and Luan, David and
                 Amodei, Dario and Sutskever, Ilya},
  title       = {Language Models are Unsupervised Multitask Learners},
  institution = {OpenAI},
  year        = {2019},
}
% arXiv preprint 1909.08053 (cs.CL), 2019.
% {Megatron-LM} is braced so sentence-casing styles do not turn it into "Megatron-lm".
@misc{shoeybi2019megatronlm,
  author        = {Shoeybi, Mohammad and Patwary, Mostofa and Puri, Raul and
                   LeGresley, Patrick and Casper, Jared and Catanzaro, Bryan},
  title         = {{Megatron-LM}: Training Multi-Billion Parameter Language Models Using Model Parallelism},
  year          = {2019},
  eprint        = {1909.08053},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}
% Journal article, Neural Computation 9(8), 1997.
% The journal name is capitalised as a proper name (styles do not recase the
% journal field), and the title is stored in Title Case so styles can
% downcase it themselves. The BibSonomy export fields at the end are not
% standard BibTeX fields and are kept only as provenance.
@article{hochreiter1997long,
  author    = {Hochreiter, Sepp and Schmidhuber, J{\"u}rgen},
  title     = {Long Short-Term Memory},
  journal   = {Neural Computation},
  volume    = {9},
  number    = {8},
  pages     = {1735--1780},
  year      = {1997},
  publisher = {MIT Press},
  keywords  = {lstm rnn},
  added-at  = {2016-11-15T08:49:43.000+0100},
  timestamp = {2016-11-15T08:49:43.000+0100},
  biburl    = {https://www.bibsonomy.org/bibtex/2a4a80026d24955b267cae636aa8abe4a/dallmann},
  interhash = {0692b471c4b9ae65d00affebc09fb467},
  intrahash = {a4a80026d24955b267cae636aa8abe4a},
}
% arXiv preprint 1804.07461 (cs.CL), 2018.
% {GLUE} is braced so sentence-casing styles do not turn it into "Glue".
@misc{wang2018glue,
  author        = {Wang, Alex and Singh, Amanpreet and Michael, Julian and
                   Hill, Felix and Levy, Omer and Bowman, Samuel R.},
  title         = {{GLUE}: A Multi-Task Benchmark and Analysis Platform for Natural Language Understanding},
  year          = {2018},
  eprint        = {1804.07461},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}
% arXiv preprint 1802.05365 (cs.CL), 2018.
@misc{peters2018deep,
  author        = {Peters, Matthew E. and Neumann, Mark and Iyyer, Mohit and
                   Gardner, Matt and Clark, Christopher and Lee, Kenton and
                   Zettlemoyer, Luke},
  title         = {Deep contextualized word representations},
  year          = {2018},
  eprint        = {1802.05365},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}
% Conference paper, 2012, pages 5149--5152.
% {Japanese} and {Korean} are braced so sentence-casing styles keep the
% proper adjectives capitalised.
@inproceedings{wordPiece,
  author    = {Schuster, Mike and Nakajima, Kaisuke},
  title     = {{Japanese} and {Korean} Voice Search},
  booktitle = {International Conference on Acoustics, Speech and Signal Processing},
  pages     = {5149--5152},
  year      = {2012},
}
% Chapter in Advances in Neural Information Processing Systems 26 (2013),
% pages 3111--3119.
% The fourth author's middle initial now carries its period ("Greg S.").
@incollection{mikolov2013,
  author    = {Mikolov, Tomas and Sutskever, Ilya and Chen, Kai and
               Corrado, Greg S. and Dean, Jeff},
  title     = {Distributed Representations of Words and Phrases and their Compositionality},
  booktitle = {Advances in Neural Information Processing Systems 26},
  editor    = {Burges, C. J. C. and Bottou, L. and Welling, M. and
               Ghahramani, Z. and Weinberger, K. Q.},
  pages     = {3111--3119},
  year      = {2013},
  publisher = {Curran Associates, Inc.},
  url       = {http://papers.nips.cc/paper/5021-distributed-representations-of-words-and-phrases-and-their-compositionality.pdf},
}
% EMNLP 2014 conference paper, pages 1532--1543.
% The model name is braced as a whole word, {GloVe}; the previous {G}love
% protected only one letter and rendered the name as "Glove".
@inproceedings{penningtonglove,
  author    = {Pennington, Jeffrey and Socher, Richard and Manning, Christopher},
  title     = {{GloVe}: Global Vectors for Word Representation},
  booktitle = {Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing ({EMNLP})},
  month     = oct,
  year      = {2014},
  address   = {Doha, Qatar},
  publisher = {Association for Computational Linguistics},
  pages     = {1532--1543},
  doi       = {10.3115/v1/D14-1162},
  url       = {https://www.aclweb.org/anthology/D14-1162},
}
% Entered as a technical report: the previous inproceedings type had no
% booktitle (a required field), so BibTeX warned and printed an incomplete
% reference. The citation key is unchanged.
% The author list is completed; the entry previously named only the first author.
@techreport{Radford2018ImprovingLU,
  author      = {Radford, Alec and Narasimhan, Karthik and Salimans, Tim and
                 Sutskever, Ilya},
  title       = {Improving Language Understanding by Generative Pre-Training},
  institution = {OpenAI},
  year        = {2018},
}
% Conference paper in the LREC 2018 proceedings.
@inproceedings{mikolov2018advances,
  author    = {Mikolov, Tomas and Grave, Edouard and Bojanowski, Piotr and
               Puhrsch, Christian and Joulin, Armand},
  title     = {Advances in Pre-Training Distributed Word Representations},
  booktitle = {Proceedings of the International Conference on Language Resources and Evaluation (LREC 2018)},
  year      = {2018},
}