% Bibliography database: low-rank adaptation, SVD-based model compression,
% mixture-of-experts, activation steering, and tooling references.
%
% Conventions used in this file:
%   - one field per line, author names in "Last, First" form joined by " and ";
%   - braces around title words whose capitalisation must survive styles that
%     sentence-case titles (acronyms, product names);
%   - bare DOIs (no resolver prefix) and raw URLs in the url field;
%   - citation keys are stable identifiers and are never renamed, even when
%     the year field of an entry is corrected.

@inproceedings{hu2021lora,
  author    = {Hu, Edward J. and Shen, Yelong and Wallis, Phillip and Allen-Zhu, Zeyuan and Li, Yuanzhi and Wang, Shean and Wang, Lu and Chen, Weizhu},
  title     = {{LoRA}: Low-Rank Adaptation of Large Language Models},
  booktitle = {International Conference on Learning Representations (ICLR)},
  year      = {2022},
  url       = {https://arxiv.org/abs/2106.09685},
}

% Software library hosted on GitHub (the URL is the repository), so this is
% typed as misc rather than as a journal article.
@misc{mangrulkar2022peft,
  author       = {Mangrulkar, Sourab and Gugger, Sylvain and Debut, Lysandre and Belkada, Younes and Paul, Sayak and Bossan, Benjamin},
  title        = {{PEFT}: State-of-the-art Parameter-Efficient Fine-Tuning methods},
  howpublished = {GitHub repository},
  year         = {2022},
  url          = {https://github.com/huggingface/peft},
}

% The DOI identifies the Numerische Mathematik journal article, so the entry
% is typed as an article and carries that venue's volume, pages and year.
% NOTE(review): the earlier book-style data (Springer, Berlin/Heidelberg, 1971)
% looks like the Handbook for Automatic Computation reprint; confirm which
% version the text means to cite. The key keeps its original year.
@article{golub1971singular,
  author    = {Golub, Gene H. and Reinsch, Christian},
  title     = {Singular Value Decomposition and Least Squares Solutions},
  journal   = {Numerische Mathematik},
  volume    = {14},
  pages     = {403--420},
  year      = {1970},
  publisher = {Springer},
  address   = {Berlin, Heidelberg},
  doi       = {10.1007/BF02163027},
}

@article{eckart1936approximation,
  author    = {Eckart, Carl and Young, Gale},
  title     = {The Approximation of One Matrix by Another of Lower Rank},
  journal   = {Psychometrika},
  volume    = {1},
  number    = {3},
  pages     = {211--218},
  year      = {1936},
  publisher = {Springer},
  doi       = {10.1007/BF02288367},
}

@inproceedings{shazeer2017outrageously,
  author    = {Shazeer, Noam and Mirhoseini, Azalia and Maziarz, Krzysztof and Davis, Andy and Le, Quoc V. and Hinton, Geoffrey E. and Dean, Jeff},
  title     = {Outrageously Large Neural Networks: The Sparsely-Gated Mixture-of-Experts Layer},
  booktitle = {International Conference on Learning Representations (ICLR)},
  year      = {2017},
  url       = {https://arxiv.org/abs/1701.06538},
}

@inproceedings{zhang2022moefication,
  author    = {Zhang, Zhengyan and Lin, Yankai and Liu, Zhiyuan and Li, Peng and Sun, Maosong and Zhou, Jie},
  title     = {{MoEfication}: Transformer Feed-forward Layers are Mixtures of Experts},
  booktitle = {Findings of the Association for Computational Linguistics: {ACL} 2022},
  pages     = {877--890},
  year      = {2022},
  doi       = {10.18653/v1/2022.findings-acl.71},
}

% NOTE(review): author list corrected to match the arXiv listing for
% 2312.05821; confirm against the arXiv abstract page.
@article{yuan2023asvd,
  author  = {Yuan, Zhihang and Shang, Yuzhang and Song, Yue and Wu, Qiang and Yan, Yan and Sun, Guangyu},
  title   = {{ASVD}: Activation-aware Singular Value Decomposition for Compressing Large Language Models},
  journal = {arXiv preprint arXiv:2312.05821},
  year    = {2023},
  url     = {https://arxiv.org/abs/2312.05821},
}

@article{sharma2023truth,
  author  = {Sharma, Pratyusha and Ash, Jordan T. and Misra, Dipendra},
  title   = {The Truth is in There: Improving Reasoning in Language Models with Layer-Selective Rank Reduction},
  journal = {arXiv preprint arXiv:2312.13558},
  year    = {2023},
  url     = {https://arxiv.org/abs/2312.13558},
}

@article{ilharco2022editing,
  author  = {Ilharco, Gabriel and Ribeiro, Marco Tulio and Wortsman, Mitchell and Gururangan, Suchin and Schmidt, Ludwig and Hajishirzi, Hannaneh and Farhadi, Ali},
  title   = {Editing Models with Task Arithmetic},
  journal = {arXiv preprint arXiv:2212.04089},
  year    = {2022},
  url     = {https://arxiv.org/abs/2212.04089},
}

@article{zou2023representation,
  author  = {Zou, Andy and Phan, Long and Chen, Sarah and Campbell, James and Guo, Phillip and Ren, Richard and Pan, Alexander and Yin, Xuwang and Mazeika, Mantas and Dombrowski, Ann-Kathrin and Goel, Shashwat and Li, Nathaniel and Byun, Michael J. and Wang, Zifan and Mallen, Alex and Basart, Steven and Koyejo, Sanmi and Song, Dawn and Fredrikson, Matt and Kolter, J. Zico and Hendrycks, Dan},
  title   = {Representation Engineering: A Top-Down Approach to {AI} Transparency},
  journal = {arXiv preprint arXiv:2310.01405},
  year    = {2023},
  url     = {https://arxiv.org/abs/2310.01405},
}

@article{turner2023activation,
  author  = {Turner, Alexander and Thiergart, Lisa and Udell, David and Leech, Gavin and Mini, Ulisse and MacDiarmid, Monte},
  title   = {Activation Addition: Steering Language Models Without Optimization},
  journal = {arXiv preprint arXiv:2308.10248},
  year    = {2023},
  url     = {https://arxiv.org/abs/2308.10248},
}

@article{halko2011finding,
  author  = {Halko, Nathan and Martinsson, Per-Gunnar and Tropp, Joel A.},
  title   = {Finding Structure with Randomness: Probabilistic Algorithms for Constructing Approximate Matrix Decompositions},
  journal = {SIAM Review},
  volume  = {53},
  number  = {2},
  pages   = {217--288},
  year    = {2011},
  doi     = {10.1137/090771806},
}

% NOTE(review): the workshop venue and author list of this entry could not be
% confirmed; the only locator given is the project repository. Verify that a
% paper with this title exists before citing it as a proceedings paper.
@inproceedings{lepage2023safetensors,
  author    = {LePage, Nicolas and von Platen, Patrick and Chaumond, Julien and Mangrulkar, Sourab},
  title     = {Safetensors: Safe, Simple, and Fast Tensor Storage},
  booktitle = {NeurIPS 2023 Workshop on Machine Learning and Compression},
  year      = {2023},
  url       = {https://github.com/huggingface/safetensors},
}

% The Qwen3 Technical Report is arXiv:2505.09388 (2025); the URL and year are
% set accordingly. The key keeps its original year.
% NOTE(review): confirm the leading authors against the arXiv abstract page.
@misc{qwen2024qwen3,
  author       = {Yang, An and Li, Anfeng and Yang, Baosong and Zhang, Beichen and Hui, Binyuan and Zheng, Bo and Yu, Bowen and Gao, Chang and Huang, Chengen and Lv, Chenxu and others},
  title        = {{Qwen3} Technical Report},
  howpublished = {arXiv preprint arXiv:2505.09388},
  year         = {2025},
  url          = {https://arxiv.org/abs/2505.09388},
}

@software{llamacpp,
  author = {Gerganov, Georgi},
  title  = {llama.cpp: {LLM} Inference in {C/C++}},
  year   = {2023},
  url    = {https://github.com/ggerganov/llama.cpp},
}

% NOTE(review): authors, year and arXiv identifier corrected to the arXiv
% record that carries this title (2407.03627); confirm against the abstract
% page. The key is kept so existing citations still resolve.
@article{cao2023dslr,
  author  = {Hwang, Taeho and Jeong, Soyeong and Cho, Sukmin and Han, SeungYoon and Park, Jong C.},
  title   = {{DSLR}: Document Refinement with Sentence-Level Re-ranking and Reconstruction to Enhance Retrieval-Augmented Generation},
  journal = {arXiv preprint arXiv:2407.03627},
  year    = {2024},
  url     = {https://arxiv.org/abs/2407.03627},
}

% Corporate author: the extra brace pair keeps the name from being split into
% given name "Docker" and surname "Inc.".
% NOTE(review): the access note gives a range rather than a date; replace it
% with the actual date the page was consulted.
@misc{docker_overlayfs,
  author = {{Docker Inc.}},
  title  = {{Docker} Storage Drivers: {OverlayFS}},
  year   = {2024},
  url    = {https://docs.docker.com/storage/storagedriver/overlayfs-driver/},
  note   = {Accessed: 2024-2026},
}