bibliography.bib

% Stochastic-depth paper; arXiv preprint 1603.09382 (cs.LG).
@misc{https://doi.org/10.48550/arxiv.1603.09382,
  author        = {Huang, Gao and Sun, Yu and Liu, Zhuang and Sedra, Daniel and Weinberger, Kilian},
  title         = {Deep Networks with Stochastic Depth},
  year          = {2016},
  eprint        = {1603.09382},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/1603.09382},
}

% Layer normalization; arXiv preprint 1607.06450 (stat.ML).
@misc{https://doi.org/10.48550/arxiv.1607.06450,
  author        = {Ba, Jimmy Lei and Kiros, Jamie Ryan and Hinton, Geoffrey E.},
  title         = {Layer Normalization},
  year          = {2016},
  eprint        = {1607.06450},
  archivePrefix = {arXiv},
  primaryClass  = {stat.ML},
}

% WaveNet; arXiv preprint 1609.03499 (cs.SD).
% The braces around WaveNet stop sentence-casing styles from printing "Wavenet".
@misc{https://doi.org/10.48550/arxiv.1609.03499,
  author        = {van den Oord, Aaron and Dieleman, Sander and Zen, Heiga and Simonyan, Karen and Vinyals, Oriol and Graves, Alex and Kalchbrenner, Nal and Senior, Andrew and Kavukcuoglu, Koray},
  title         = {{WaveNet}: A Generative Model for Raw Audio},
  year          = {2016},
  eprint        = {1609.03499},
  archivePrefix = {arXiv},
  primaryClass  = {cs.SD},
}

% SGDR (warm restarts); arXiv preprint 1608.03983 (cs.LG).
% The acronym is braced so sentence-casing styles do not print "Sgdr".
@misc{https://doi.org/10.48550/arxiv.1608.03983,
  author        = {Loshchilov, Ilya and Hutter, Frank},
  title         = {{SGDR}: Stochastic Gradient Descent with Warm Restarts},
  year          = {2017},
  eprint        = {1608.03983},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
}

% Gated convolutional language models; arXiv preprint 1612.08083 (cs.CL).
@misc{https://doi.org/10.48550/arXiv.1612.08083,
  author        = {Dauphin, Yann N. and Fan, Angela and Auli, Michael and Grangier, David},
  title         = {Language Modeling with Gated Convolutional Networks},
  year          = {2017},
  eprint        = {1612.08083},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/1612.08083},
}

% Transformer paper; arXiv preprint 1706.03762.
% Exported from DataCite: the repeated "FOS:" keyword is removed and the arXiv
% eprint fields are added to match the other entries in this file.
@misc{https://doi.org/10.48550/arxiv.1706.03762,
  author        = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N. and Kaiser, Lukasz and Polosukhin, Illia},
  title         = {Attention Is All You Need},
  year          = {2017},
  publisher     = {arXiv},
  eprint        = {1706.03762},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  doi           = {10.48550/ARXIV.1706.03762},
  url           = {https://arxiv.org/abs/1706.03762},
  keywords      = {Computation and Language (cs.CL), Machine Learning (cs.LG), FOS: Computer and information sciences},
  copyright     = {arXiv.org perpetual, non-exclusive license},
}

% Relative position representations; arXiv preprint 1803.02155 (cs.CL).
@misc{https://doi.org/10.48550/arxiv.1803.02155,
  author        = {Shaw, Peter and Uszkoreit, Jakob and Vaswani, Ashish},
  title         = {Self-Attention with Relative Position Representations},
  year          = {2018},
  eprint        = {1803.02155},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}

% Transformer-XL; arXiv preprint 1901.02860 (cs.LG).
% The model name is braced so sentence-casing styles do not print "Transformer-xl".
@misc{https://doi.org/10.48550/arxiv.1901.02860,
  author        = {Dai, Zihang and Yang, Zhilin and Yang, Yiming and Carbonell, Jaime and Le, Quoc V. and Salakhutdinov, Ruslan},
  title         = {{Transformer-XL}: Attentive Language Models Beyond a Fixed-Length Context},
  year          = {2019},
  eprint        = {1901.02860},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
}

% SpecAugment, as published at Interspeech 2019 (arXiv id 1904.08779, taken from the key).
% Typed as inproceedings because standard styles do not print booktitle on a
% misc entry, so the venue was silently dropped. The title name is braced
% against sentence-casing.
% NOTE(review): doi and pages were added from the published proceedings record; please confirm.
@inproceedings{https://doi.org/10.48550/arxiv.1904.08779,
  author        = {Park, Daniel S. and Chan, William and Zhang, Yu and Chiu, Chung-Cheng and Zoph, Barret and Cubuk, Ekin D. and Le, Quoc V.},
  title         = {{SpecAugment}: A Simple Data Augmentation Method for Automatic Speech Recognition},
  booktitle     = {Interspeech 2019},
  year          = {2019},
  pages         = {2613--2617},
  publisher     = {ISCA},
  doi           = {10.21437/Interspeech.2019-2680},
  eprint        = {1904.08779},
  archivePrefix = {arXiv},
}

% Neural text degeneration; arXiv preprint 1904.09751 (cs.CL).
@misc{https://doi.org/10.48550/arxiv.1904.09751,
  author        = {Holtzman, Ari and Buys, Jan and Du, Li and Forbes, Maxwell and Choi, Yejin},
  title         = {The Curious Case of Neural Text Degeneration},
  year          = {2020},
  eprint        = {1904.09751},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}

% Structured dropout for Transformer depth; arXiv preprint 1909.11556.
% Exported from DataCite: the repeated "FOS:" keyword is removed and the arXiv
% eprint fields are added to match the other entries in this file.
@misc{https://doi.org/10.48550/arxiv.1909.11556,
  author        = {Fan, Angela and Grave, Edouard and Joulin, Armand},
  title         = {Reducing Transformer Depth on Demand with Structured Dropout},
  year          = {2019},
  publisher     = {arXiv},
  eprint        = {1909.11556},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  doi           = {10.48550/ARXIV.1909.11556},
  url           = {https://arxiv.org/abs/1909.11556},
  keywords      = {Machine Learning (cs.LG), Computation and Language (cs.CL), Machine Learning (stat.ML), FOS: Computer and information sciences},
  copyright     = {arXiv.org perpetual, non-exclusive license},
}

% RMS layer normalization; arXiv preprint 1910.07467 (cs.LG).
@misc{https://doi.org/10.48550/arxiv.1910.07467,
  author        = {Zhang, Biao and Sennrich, Rico},
  title         = {Root Mean Square Layer Normalization},
  year          = {2019},
  eprint        = {1910.07467},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
}

% End-to-end ASR with modern architectures; arXiv preprint 1911.08460.
% Exported from DataCite: the repeated "FOS:" keywords are removed, the arXiv
% eprint fields are added, and ASR is braced so sentence-casing styles keep it
% upper-case.
@misc{https://doi.org/10.48550/arxiv.1911.08460,
  author        = {Synnaeve, Gabriel and Xu, Qiantong and Kahn, Jacob and Likhomanenko, Tatiana and Grave, Edouard and Pratap, Vineel and Sriram, Anuroop and Liptchinsky, Vitaliy and Collobert, Ronan},
  title         = {End-to-end {ASR}: from Supervised to Semi-Supervised Learning with Modern Architectures},
  year          = {2019},
  publisher     = {arXiv},
  eprint        = {1911.08460},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  doi           = {10.48550/ARXIV.1911.08460},
  url           = {https://arxiv.org/abs/1911.08460},
  keywords      = {Computation and Language (cs.CL), Sound (cs.SD), Audio and Speech Processing (eess.AS), FOS: Computer and information sciences, FOS: Electrical engineering, electronic engineering, information engineering},
  copyright     = {arXiv.org perpetual, non-exclusive license},
}

% Layer normalization placement in the Transformer; arXiv preprint 2002.04745.
% Exported from DataCite: the repeated "FOS:" keyword is removed and the arXiv
% eprint fields are added to match the other entries in this file.
@misc{https://doi.org/10.48550/arxiv.2002.04745,
  author        = {Xiong, Ruibin and Yang, Yunchang and He, Di and Zheng, Kai and Zheng, Shuxin and Xing, Chen and Zhang, Huishuai and Lan, Yanyan and Wang, Liwei and Liu, Tie-Yan},
  title         = {On Layer Normalization in the Transformer Architecture},
  year          = {2020},
  publisher     = {arXiv},
  eprint        = {2002.04745},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  doi           = {10.48550/ARXIV.2002.04745},
  url           = {https://arxiv.org/abs/2002.04745},
  keywords      = {Machine Learning (cs.LG), Computation and Language (cs.CL), Machine Learning (stat.ML), FOS: Computer and information sciences},
  copyright     = {arXiv.org perpetual, non-exclusive license},
}

% GLU variants; arXiv preprint 2002.05202 (cs.LG).
% The acronym is braced so sentence-casing styles do not print "Glu".
@misc{https://doi.org/10.48550/arxiv.2002.05202,
  author        = {Shazeer, Noam},
  title         = {{GLU} Variants Improve Transformer},
  year          = {2020},
  eprint        = {2002.05202},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
}

% Longformer; arXiv preprint 2004.05150 (cs.CL).
@misc{https://doi.org/10.48550/arxiv.2004.05150,
  author        = {Beltagy, Iz and Peters, Matthew E. and Cohan, Arman},
  title         = {Longformer: The Long-Document Transformer},
  year          = {2020},
  eprint        = {2004.05150},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}

% Conformer; arXiv preprint 2005.08100.
% Exported from DataCite: the repeated "FOS:" keywords are removed and the arXiv
% eprint fields are added to match the other entries in this file.
@misc{https://doi.org/10.48550/arxiv.2005.08100,
  author        = {Gulati, Anmol and Qin, James and Chiu, Chung-Cheng and Parmar, Niki and Zhang, Yu and Yu, Jiahui and Han, Wei and Wang, Shibo and Zhang, Zhengdong and Wu, Yonghui and Pang, Ruoming},
  title         = {Conformer: Convolution-augmented Transformer for Speech Recognition},
  year          = {2020},
  publisher     = {arXiv},
  eprint        = {2005.08100},
  archivePrefix = {arXiv},
  primaryClass  = {eess.AS},
  doi           = {10.48550/ARXIV.2005.08100},
  url           = {https://arxiv.org/abs/2005.08100},
  keywords      = {Audio and Speech Processing (eess.AS), Machine Learning (cs.LG), Sound (cs.SD), FOS: Electrical engineering, electronic engineering, information engineering, FOS: Computer and information sciences},
  copyright     = {arXiv.org perpetual, non-exclusive license},
}

% wav2vec 2.0; arXiv preprint 2006.11477 (cs.CL).
@misc{https://doi.org/10.48550/arxiv.2006.11477,
  author        = {Baevski, Alexei and Zhou, Henry and Mohamed, Abdelrahman and Auli, Michael},
  title         = {wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations},
  year          = {2020},
  eprint        = {2006.11477},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}

% fairseq S2T; arXiv preprint 2010.05171.
% Exported from DataCite: the repeated "FOS:" keywords are removed, the arXiv
% eprint fields are added, and S2T is braced so sentence-casing styles do not
% print "s2t".
@misc{https://doi.org/10.48550/arxiv.2010.05171,
  author        = {Wang, Changhan and Tang, Yun and Ma, Xutai and Wu, Anne and Popuri, Sravya and Okhonko, Dmytro and Pino, Juan},
  title         = {fairseq {S2T}: Fast Speech-to-Text Modeling with fairseq},
  year          = {2020},
  publisher     = {arXiv},
  eprint        = {2010.05171},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  doi           = {10.48550/ARXIV.2010.05171},
  url           = {https://arxiv.org/abs/2010.05171},
  keywords      = {Computation and Language (cs.CL), Audio and Speech Processing (eess.AS), FOS: Computer and information sciences, FOS: Electrical engineering, electronic engineering, information engineering},
  copyright     = {arXiv.org perpetual, non-exclusive license},
}

% RoFormer (rotary position embedding); arXiv preprint 2104.09864.
% Exported from DataCite: the repeated "FOS:" keyword is removed, the arXiv
% eprint fields are added, and RoFormer is braced so sentence-casing styles do
% not print "Roformer".
@misc{https://doi.org/10.48550/arxiv.2104.09864,
  author        = {Su, Jianlin and Lu, Yu and Pan, Shengfeng and Murtadha, Ahmed and Wen, Bo and Liu, Yunfeng},
  title         = {{RoFormer}: Enhanced Transformer with Rotary Position Embedding},
  year          = {2021},
  publisher     = {arXiv},
  eprint        = {2104.09864},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  doi           = {10.48550/ARXIV.2104.09864},
  url           = {https://arxiv.org/abs/2104.09864},
  keywords      = {Computation and Language (cs.CL), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), FOS: Computer and information sciences},
  copyright     = {Creative Commons Attribution Non Commercial No Derivatives 4.0 International},
}

% W2v-BERT; arXiv preprint 2108.06209 (cs.LG).
% The model name is braced so sentence-casing styles do not print "W2v-bert".
@misc{https://doi.org/10.48550/arxiv.2108.06209,
  author        = {Chung, Yu-An and Zhang, Yu and Han, Wei and Chiu, Chung-Cheng and Qin, James and Pang, Ruoming and Wu, Yonghui},
  title         = {{W2v-BERT}: Combining Contrastive Learning and Masked Language Modeling for Self-Supervised Speech Pre-Training},
  year          = {2021},
  eprint        = {2108.06209},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
}

% Attention with linear biases; arXiv preprint 2108.12409.
% Exported from DataCite: the repeated "FOS:" keyword is removed and the arXiv
% eprint fields are added to match the other entries in this file.
@misc{https://doi.org/10.48550/arxiv.2108.12409,
  author        = {Press, Ofir and Smith, Noah A. and Lewis, Mike},
  title         = {Train Short, Test Long: Attention with Linear Biases Enables Input Length Extrapolation},
  year          = {2021},
  publisher     = {arXiv},
  eprint        = {2108.12409},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  doi           = {10.48550/ARXIV.2108.12409},
  url           = {https://arxiv.org/abs/2108.12409},
  keywords      = {Computation and Language (cs.CL), FOS: Computer and information sciences},
  copyright     = {arXiv.org perpetual, non-exclusive license},
}

% Vision Transformer paper; arXiv preprint 2010.11929 (cs.CV).
@misc{https://doi.org/10.48550/arXiv.2010.11929,
  author        = {Dosovitskiy, Alexey and Beyer, Lucas and Kolesnikov, Alexander and Weissenborn, Dirk and Zhai, Xiaohua and Unterthiner, Thomas and Dehghani, Mostafa and Minderer, Matthias and Heigold, Georg and Gelly, Sylvain and Uszkoreit, Jakob and Houlsby, Neil},
  title         = {An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale},
  year          = {2021},
  eprint        = {2010.11929},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
  url           = {https://arxiv.org/abs/2010.11929},
}

% NormFormer; arXiv preprint 2110.09456.
% Exported from DataCite: the repeated "FOS:" keyword is removed, the arXiv
% eprint fields are added, and NormFormer is braced so sentence-casing styles
% do not print "Normformer".
@misc{https://doi.org/10.48550/arxiv.2110.09456,
  author        = {Shleifer, Sam and Weston, Jason and Ott, Myle},
  title         = {{NormFormer}: Improved Transformer Pretraining with Extra Normalization},
  year          = {2021},
  publisher     = {arXiv},
  eprint        = {2110.09456},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  doi           = {10.48550/ARXIV.2110.09456},
  url           = {https://arxiv.org/abs/2110.09456},
  keywords      = {Computation and Language (cs.CL), Artificial Intelligence (cs.AI), FOS: Computer and information sciences},
  copyright     = {arXiv.org perpetual, non-exclusive license},
}

% No Language Left Behind; arXiv preprint 2207.04672 (cs.CL).
% "NLLB Team" is a collective author. The extra braces make BibTeX treat it as
% one indivisible name instead of first name "NLLB" and surname "Team".
@misc{https://doi.org/10.48550/arxiv.2207.04672,
  author        = {{NLLB Team} and Marta R. Costa-jussà and James Cross and Onur Çelebi and Maha Elbayad and Kenneth Heafield and Kevin Heffernan and Elahe Kalbassi and Janice Lam and Daniel Licht and Jean Maillard and Anna Sun and Skyler Wang and Guillaume Wenzek and Al Youngblood and Bapi Akula and Loic Barrault and Gabriel Mejia Gonzalez and Prangthip Hansanti and John Hoffman and Semarley Jarrett and Kaushik Ram Sadagopan and Dirk Rowe and Shannon Spruit and Chau Tran and Pierre Andrews and Necip Fazil Ayan and Shruti Bhosale and Sergey Edunov and Angela Fan and Cynthia Gao and Vedanuj Goswami and Francisco Guzmán and Philipp Koehn and Alexandre Mourachko and Christophe Ropers and Safiyyah Saleem and Holger Schwenk and Jeff Wang},
  title         = {No Language Left Behind: Scaling Human-Centered Machine Translation},
  year          = {2022},
  eprint        = {2207.04672},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  doi           = {10.48550/arxiv.2207.04672},
  url           = {https://arxiv.org/abs/2207.04672},
}

% UnitY; arXiv preprint 2212.08055 (cs.CL).
% The model name is braced so sentence-casing styles do not print "Unity".
@misc{https://doi.org/10.48550/arxiv.2212.08055,
  author        = {Inaguma, Hirofumi and Popuri, Sravya and Kulikov, Ilia and Chen, Peng-Jen and Wang, Changhan and Chung, Yu-An and Tang, Yun and Lee, Ann and Watanabe, Shinji and Pino, Juan},
  title         = {{UnitY}: Two-pass Direct Speech-to-speech Translation with Discrete Units},
  year          = {2023},
  eprint        = {2212.08055},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}

% LLaMA; arXiv preprint 2302.13971 (cs.CL).
% The model name is braced so sentence-casing styles do not print "Llama".
% The author list is unchanged and keeps its LaTeX accent escapes.
@misc{https://doi.org/10.48550/arxiv.2302.13971,
  author        = {Hugo Touvron and Thibaut Lavril and Gautier Izacard and Xavier Martinet and Marie{-}Anne Lachaux and Timoth{\'{e}}e Lacroix and Baptiste Rozi{\`{e}}re and Naman Goyal and Eric Hambro and Faisal Azhar and Aur{\'{e}}lien Rodriguez and Armand Joulin and Edouard Grave and Guillaume Lample},
  title         = {{LLaMA}: Open and Efficient Foundation Language Models},
  year          = {2023},
  eprint        = {2302.13971},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}

% GQA (grouped multi-query attention); arXiv preprint 2305.13245 (cs.CL).
% The acronym is braced so sentence-casing styles do not print "Gqa".
@misc{https://doi.org/10.48550/arXiv.2305.13245,
  author        = {Ainslie, Joshua and Lee-Thorp, James and de Jong, Michiel and Zemlyanskiy, Yury and Lebrón, Federico and Sanghai, Sumit},
  title         = {{GQA}: Training Generalized Multi-Query Transformer Models from Multi-Head Checkpoints},
  year          = {2023},
  eprint        = {2305.13245},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}

% Llama 2; arXiv preprint 2307.09288 (cs.CL). Author list kept exactly as entered.
@misc{https://doi.org/10.48550/arXiv.2307.09288,
  author        = {Hugo Touvron and Louis Martin and Kevin Stone and Peter Albert and Amjad Almahairi and Yasmine Babaei and Nikolay Bashlykov and Soumya Batra and Prajjwal Bhargava and Shruti Bhosale and Dan Bikel and Lukas Blecher and Cristian Canton Ferrer and Moya Chen and Guillem Cucurull and David Esiobu and Jude Fernandes and Jeremy Fu and Wenyin Fu and Brian Fuller and Cynthia Gao and Vedanuj Goswami and Naman Goyal and Anthony Hartshorn and Saghar Hosseini and Rui Hou and Hakan Inan and Marcin Kardas and Viktor Kerkez and Madian Khabsa and Isabel Kloumann and Artem Korenev and Punit Singh Koura and Marie-Anne Lachaux and Thibaut Lavril and Jenya Lee and Diana Liskovich and Yinghai Lu and Yuning Mao and Xavier Martinet and Todor Mihaylov and Pushkar Mishra and Igor Molybog and Yixin Nie and Andrew Poulton and Jeremy Reizenstein and Rashi Rungta and Kalyan Saladi and Alan Schelten and Ruan Silva and Eric Michael Smith and Ranjan Subramanian and Xiaoqing Ellen Tan and Binh Tang and Ross Taylor and Adina Williams and Jian Xiang Kuan and Puxin Xu and Zheng Yan and Iliyan Zarov and Yuchen Zhang and Angela Fan and Melanie Kambadur and Sharan Narang and Aurelien Rodriguez and Robert Stojnic and Sergey Edunov and Thomas Scialom},
  title         = {Llama 2: Open Foundation and Fine-Tuned Chat Models},
  year          = {2023},
  eprint        = {2307.09288},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}

% Mistral 7B; arXiv preprint 2310.06825 (cs.CL).
% The title is braced so sentence-casing styles do not print "Mistral 7b".
% Names use the "Last, First" form so the compound surnames "Le Scao" and
% "El Sayed" are no longer split, with "Le"/"El" read as given names.
% NOTE(review): "Renard Lavaud" may also be a compound surname; the original
% parse (surname "Lavaud") is kept here, so please confirm.
@misc{https://doi.org/10.48550/arXiv.2310.06825,
  author        = {Jiang, Albert Q. and Sablayrolles, Alexandre and Mensch, Arthur and Bamford, Chris and Chaplot, Devendra Singh and de las Casas, Diego and Bressand, Florian and Lengyel, Gianna and Lample, Guillaume and Saulnier, Lucile and Lavaud, Lélio Renard and Lachaux, Marie-Anne and Stock, Pierre and Le Scao, Teven and Lavril, Thibaut and Wang, Thomas and Lacroix, Timothée and El Sayed, William},
  title         = {{Mistral 7B}},
  year          = {2023},
  eprint        = {2310.06825},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}

% Joint-embedding predictive architecture for images; arXiv preprint 2301.08243 (cs.CV).
@misc{https://doi.org/10.48550/arXiv.2301.08243,
  author        = {Assran, Mahmoud and Duval, Quentin and Misra, Ishan and Bojanowski, Piotr and Vincent, Pascal and Rabbat, Michael and LeCun, Yann and Ballas, Nicolas},
  title         = {Self-Supervised Learning from Images with a Joint-Embedding Predictive Architecture},
  year          = {2023},
  eprint        = {2301.08243},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
  url           = {https://arxiv.org/abs/2301.08243},
}

% Feature prediction for video representations; arXiv preprint 2404.08471 (cs.CV).
@misc{https://doi.org/10.48550/arXiv.2404.08471,
  author        = {Bardes, Adrien and Garrido, Quentin and Ponce, Jean and Chen, Xinlei and Rabbat, Michael and LeCun, Yann and Assran, Mahmoud and Ballas, Nicolas},
  title         = {Revisiting Feature Prediction for Learning Visual Representations from Video},
  year          = {2024},
  eprint        = {2404.08471},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
  url           = {https://arxiv.org/abs/2404.08471},
}