{"payload":{"feedbackUrl":"https://github.com/orgs/community/discussions/53140","repo":{"id":683573931,"defaultBranch":"main","name":"Awesome-LLM-Inference","ownerLogin":"DefTruth","currentUserCanPush":false,"isFork":false,"isEmpty":false,"createdAt":"2023-08-27T02:32:15.000Z","ownerAvatar":"https://avatars.githubusercontent.com/u/31974251?v=4","public":true,"private":false,"isOrgOwned":false},"refInfo":{"name":"","listCacheKey":"v0:1717233877.0","currentOid":""},"activityList":{"items":[{"before":"49ade184939e4fe62bddf2d7275dc71d6c7cd46a","after":"71977627283fda306696e39738fa9ee93de37451","ref":"refs/heads/main","pushedAt":"2024-06-05T01:45:17.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"DefTruth","name":"DefTruth","path":"/DefTruth","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/31974251?s=80&v=4"},"commit":{"message":"πŸ”₯[I-LLM] I-LLM: Efficient Integer-Only Inference for Fully-Quantized Low-Bit Large Language Models(@Houmo AI)","shortMessageHtmlLink":"πŸ”₯[I-LLM] I-LLM: Efficient Integer-Only Inference for Fully-Quantized …"}},{"before":"14576cae786bedfe0bc0e975c3352de0137cc23d","after":"49ade184939e4fe62bddf2d7275dc71d6c7cd46a","ref":"refs/heads/main","pushedAt":"2024-06-03T01:44:34.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"DefTruth","name":"DefTruth","path":"/DefTruth","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/31974251?s=80&v=4"},"commit":{"message":"πŸ”₯πŸ”₯[DeFT] DeFT: Decoding with Flash Tree-Attention for Efficient Tree-structured LLM Inference(@Westlake University etc)","shortMessageHtmlLink":"πŸ”₯πŸ”₯[DeFT] DeFT: Decoding with Flash Tree-Attention for Efficient Tree-…"}},{"before":"296260510379cc95a6f302b80177b52ac2c4c308","after":"14576cae786bedfe0bc0e975c3352de0137cc23d","ref":"refs/heads/main","pushedAt":"2024-06-01T09:25:00.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"DefTruth","name":"DefTruth","path":"/DefTruth","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/31974251?s=80&v=4"},"commit":{"message":"Update README.md","shortMessageHtmlLink":"Update README.md"}},{"before":"422672eb0ee8b925e3bfbbdff1708a035ee26e0d","after":"296260510379cc95a6f302b80177b52ac2c4c308","ref":"refs/heads/main","pushedAt":"2024-06-01T09:20:13.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"DefTruth","name":"DefTruth","path":"/DefTruth","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/31974251?s=80&v=4"},"commit":{"message":"Update README.md","shortMessageHtmlLink":"Update README.md"}},{"before":"25fef5ccd1af170a22c1a339e89e8c8b7498f4c3","after":"422672eb0ee8b925e3bfbbdff1708a035ee26e0d","ref":"refs/heads/main","pushedAt":"2024-06-01T08:47:39.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"DefTruth","name":"DefTruth","path":"/DefTruth","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/31974251?s=80&v=4"},"commit":{"message":"Add many LLM Inference papers","shortMessageHtmlLink":"Add many LLM Inference papers"}},{"before":"ab989bab9b39ffc3deb12b4ddb5e5611dc492441","after":"25fef5ccd1af170a22c1a339e89e8c8b7498f4c3","ref":"refs/heads/main","pushedAt":"2024-05-30T02:09:18.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"DefTruth","name":"DefTruth","path":"/DefTruth","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/31974251?s=80&v=4"},"commit":{"message":"πŸ”₯[Instructive Decoding] INSTRUCTIVE DECODING: INSTRUCTION-TUNED LARGE LANGUAGE MODELS ARE SELF-REFINER FROM NOISY INSTRUCTIONS(@KAIST AI)","shortMessageHtmlLink":"πŸ”₯[Instructive Decoding] INSTRUCTIVE DECODING: INSTRUCTION-TUNED LARGE…"}},{"before":"790d27f317f68b1d953c89cc8779cf094a66c540","after":"ab989bab9b39ffc3deb12b4ddb5e5611dc492441","ref":"refs/heads/main","pushedAt":"2024-05-27T02:19:00.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"DefTruth","name":"DefTruth","path":"/DefTruth","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/31974251?s=80&v=4"},"commit":{"message":"πŸ”₯[ZipCache] ZipCache: Accurate and Efficient KV Cache Quantization with Salient Token Identification(@Zhejiang University etc)","shortMessageHtmlLink":"πŸ”₯[ZipCache] ZipCache: Accurate and Efficient KV Cache Quantization wi…"}},{"before":"9a4e04d04bb74c4a13cc3ff7a3ced8a9c7f4f7b7","after":"790d27f317f68b1d953c89cc8779cf094a66c540","ref":"refs/heads/main","pushedAt":"2024-05-27T02:15:20.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"DefTruth","name":"DefTruth","path":"/DefTruth","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/31974251?s=80&v=4"},"commit":{"message":"Update README.md","shortMessageHtmlLink":"Update README.md"}},{"before":"32680bee00de2a59811091648b9a6dcf63103ddd","after":"9a4e04d04bb74c4a13cc3ff7a3ced8a9c7f4f7b7","ref":"refs/heads/main","pushedAt":"2024-05-25T06:13:55.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"DefTruth","name":"DefTruth","path":"/DefTruth","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/31974251?s=80&v=4"},"commit":{"message":"Update README.md","shortMessageHtmlLink":"Update README.md"}},{"before":"52f1dbcc2adc5246208c46ef77d91d9111da1c0a","after":"32680bee00de2a59811091648b9a6dcf63103ddd","ref":"refs/heads/main","pushedAt":"2024-05-25T05:58:05.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"DefTruth","name":"DefTruth","path":"/DefTruth","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/31974251?s=80&v=4"},"commit":{"message":"Update README.md","shortMessageHtmlLink":"Update README.md"}},{"before":"271bd694f8a836d8c2cbffed8346a069247d2770","after":"52f1dbcc2adc5246208c46ef77d91d9111da1c0a","ref":"refs/heads/main","pushedAt":"2024-05-20T09:10:13.000Z","pushType":"pr_merge","commitsCount":2,"pusher":{"login":"DefTruth","name":"DefTruth","path":"/DefTruth","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/31974251?s=80&v=4"},"commit":{"message":"Merge pull request #16 from KylinC/main\n\nupdate [Decoding Speculative Decoding] github repo","shortMessageHtmlLink":"Merge pull request #16 from KylinC/main"}},{"before":"a31b3c7f6a06ced107d6bcf0895d88b829292067","after":"271bd694f8a836d8c2cbffed8346a069247d2770","ref":"refs/heads/main","pushedAt":"2024-05-20T06:09:23.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"DefTruth","name":"DefTruth","path":"/DefTruth","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/31974251?s=80&v=4"},"commit":{"message":"πŸ”₯πŸ”₯[SQKV] SKVQ: Sliding-window Key and Value Cache Quantization for Large Language Models(@Shanghai AI Laboratory)","shortMessageHtmlLink":"πŸ”₯πŸ”₯[SQKV] SKVQ: Sliding-window Key and Value Cache Quantization for La…"}},{"before":"f6a6c0dc57b882e7ef62820e993341a08a3a606f","after":"a31b3c7f6a06ced107d6bcf0895d88b829292067","ref":"refs/heads/main","pushedAt":"2024-05-15T03:52:08.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"DefTruth","name":"DefTruth","path":"/DefTruth","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/31974251?s=80&v=4"},"commit":{"message":"πŸ”₯πŸ”₯[YOCO] You Only Cache Once: Decoder-Decoder Architectures for Language Models(@Microsoft)","shortMessageHtmlLink":"πŸ”₯πŸ”₯[YOCO] You Only Cache Once: Decoder-Decoder Architectures for Langu…"}},{"before":"dc32f2c9c9beaa72af633b2e6e44a7bdcfe144ec","after":"f6a6c0dc57b882e7ef62820e993341a08a3a606f","ref":"refs/heads/main","pushedAt":"2024-05-13T01:18:14.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"DefTruth","name":"DefTruth","path":"/DefTruth","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/31974251?s=80&v=4"},"commit":{"message":"Update Trending LLM/LMM Topics","shortMessageHtmlLink":"Update Trending LLM/LMM Topics"}},{"before":"876ccc45f5045403e79796229f5e60b792b3900c","after":"dc32f2c9c9beaa72af633b2e6e44a7bdcfe144ec","ref":"refs/heads/main","pushedAt":"2024-05-12T12:29:59.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"DefTruth","name":"DefTruth","path":"/DefTruth","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/31974251?s=80&v=4"},"commit":{"message":"πŸ”₯πŸ”₯πŸ”₯[DeepSeek-V2] DeepSeek-V2: A Strong, Economical, and Efficient Mixture-of-Experts Language Model(@DeepSeek-AI)","shortMessageHtmlLink":"πŸ”₯πŸ”₯πŸ”₯[DeepSeek-V2] DeepSeek-V2: A Strong, Economical, and Efficient Mix…"}},{"before":"86e661290bd1ef269334b0272a9e6eb07319c5f2","after":"876ccc45f5045403e79796229f5e60b792b3900c","ref":"refs/heads/main","pushedAt":"2024-05-12T02:50:50.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"DefTruth","name":"DefTruth","path":"/DefTruth","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/31974251?s=80&v=4"},"commit":{"message":"πŸ”₯πŸ”₯[YOCO] You Only Cache Once: Decoder-Decoder Architectures for Language Models(@Microsoft)","shortMessageHtmlLink":"πŸ”₯πŸ”₯[YOCO] You Only Cache Once: Decoder-Decoder Architectures for Langu…"}},{"before":"d1da65db2cc8e893bd4094d9acf905ad248e2d55","after":"86e661290bd1ef269334b0272a9e6eb07319c5f2","ref":"refs/heads/main","pushedAt":"2024-05-10T09:47:18.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"DefTruth","name":"DefTruth","path":"/DefTruth","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/31974251?s=80&v=4"},"commit":{"message":"[KV-Runahead] KV-Runahead: Scalable Causal LLM Inference by Parallel Key-Value Cache Generation(@Apple etc)","shortMessageHtmlLink":"[KV-Runahead] KV-Runahead: Scalable Causal LLM Inference by Parallel …"}},{"before":"27d4d89e73fdac62ab2bc571642f86b1b9f353b3","after":"d1da65db2cc8e893bd4094d9acf905ad248e2d55","ref":"refs/heads/main","pushedAt":"2024-05-09T02:09:56.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"DefTruth","name":"DefTruth","path":"/DefTruth","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/31974251?s=80&v=4"},"commit":{"message":"Update README.md","shortMessageHtmlLink":"Update README.md"}},{"before":"9e18cb4468850ca9c9b0d098b0e4d790b2357a8a","after":"27d4d89e73fdac62ab2bc571642f86b1b9f353b3","ref":"refs/heads/main","pushedAt":"2024-05-09T02:06:19.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"DefTruth","name":"DefTruth","path":"/DefTruth","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/31974251?s=80&v=4"},"commit":{"message":"Update README.md","shortMessageHtmlLink":"Update README.md"}},{"before":"43c5a728191972dacc31de419f1a55a4aa3ed90c","after":"9e18cb4468850ca9c9b0d098b0e4d790b2357a8a","ref":"refs/heads/main","pushedAt":"2024-05-09T02:05:18.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"DefTruth","name":"DefTruth","path":"/DefTruth","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/31974251?s=80&v=4"},"commit":{"message":"[KVCache-1Bit] KV Cache is 1 Bit Per Channel: Efficient Large Language Model Inference with Coupled Quantization(@Rice University)","shortMessageHtmlLink":"[KVCache-1Bit] KV Cache is 1 Bit Per Channel: Efficient Large Languag…"}},{"before":"217ab533ea860949e8989c741e836e50b404bba3","after":"43c5a728191972dacc31de419f1a55a4aa3ed90c","ref":"refs/heads/main","pushedAt":"2024-05-09T01:56:15.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"DefTruth","name":"DefTruth","path":"/DefTruth","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/31974251?s=80&v=4"},"commit":{"message":"πŸ”₯[vAttention] vAttention: Dynamic Memory Management for Serving LLMs without PagedAttention(@Microsoft Research India)","shortMessageHtmlLink":"πŸ”₯[vAttention] vAttention: Dynamic Memory Management for Serving LLMs …"}},{"before":"80200153765f8e70fb09865f37b275899d2201b1","after":"217ab533ea860949e8989c741e836e50b404bba3","ref":"refs/heads/main","pushedAt":"2024-05-09T01:52:49.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"DefTruth","name":"DefTruth","path":"/DefTruth","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/31974251?s=80&v=4"},"commit":{"message":"πŸ”₯πŸ”₯[W4A8KV4] QServe: W4A8KV4 Quantization and System Co-design for Efficient LLM Serving(@MIT&NVIDIA)","shortMessageHtmlLink":"πŸ”₯πŸ”₯[W4A8KV4] QServe: W4A8KV4 Quantization and System Co-design for Eff…"}},{"before":"0b2a7c6300c8d1ec17feaafe54af0ba9c981a75d","after":"80200153765f8e70fb09865f37b275899d2201b1","ref":"refs/heads/main","pushedAt":"2024-05-02T06:25:18.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"DefTruth","name":"DefTruth","path":"/DefTruth","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/31974251?s=80&v=4"},"commit":{"message":"πŸ”₯πŸ”₯[ChunkAttention] ChunkAttention: Efficient Self-Attention with Prefix-Aware KV Cache and Two-Phase Partition(@microsoft.com)","shortMessageHtmlLink":"πŸ”₯πŸ”₯[ChunkAttention] ChunkAttention: Efficient Self-Attention with Pref…"}},{"before":"506f38a0169d80688a41c7cfef9f789af19640e8","after":"0b2a7c6300c8d1ec17feaafe54af0ba9c981a75d","ref":"refs/heads/main","pushedAt":"2024-05-01T06:39:19.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"DefTruth","name":"DefTruth","path":"/DefTruth","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/31974251?s=80&v=4"},"commit":{"message":"πŸ”₯πŸ”₯[KCache] EFFICIENT LLM INFERENCE WITH KCACHE(@Qiaozhi He, Zhihua Wu)","shortMessageHtmlLink":"πŸ”₯πŸ”₯[KCache] EFFICIENT LLM INFERENCE WITH KCACHE(@qiaozhi He, Zhihua Wu)"}},{"before":"754ec24ae385971f7a036cfaf7b7f33754f265fd","after":"506f38a0169d80688a41c7cfef9f789af19640e8","ref":"refs/heads/main","pushedAt":"2024-04-28T03:23:43.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"DefTruth","name":"DefTruth","path":"/DefTruth","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/31974251?s=80&v=4"},"commit":{"message":"Update README.md","shortMessageHtmlLink":"Update README.md"}},{"before":"0d742612afb92bc685162837efd859093022c67c","after":"754ec24ae385971f7a036cfaf7b7f33754f265fd","ref":"refs/heads/main","pushedAt":"2024-04-27T06:37:14.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"DefTruth","name":"DefTruth","path":"/DefTruth","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/31974251?s=80&v=4"},"commit":{"message":"Update README.md","shortMessageHtmlLink":"Update README.md"}},{"before":"4ddc18ae2611679ecbfdd4beef3a9a617a9904d9","after":"0d742612afb92bc685162837efd859093022c67c","ref":"refs/heads/main","pushedAt":"2024-04-27T06:36:45.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"DefTruth","name":"DefTruth","path":"/DefTruth","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/31974251?s=80&v=4"},"commit":{"message":"Update README.md","shortMessageHtmlLink":"Update README.md"}},{"before":"1c728c9f1c2dd26fa1d74080cf2c67b4155096c0","after":"4ddc18ae2611679ecbfdd4beef3a9a617a9904d9","ref":"refs/heads/main","pushedAt":"2024-04-27T06:31:55.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"DefTruth","name":"DefTruth","path":"/DefTruth","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/31974251?s=80&v=4"},"commit":{"message":"πŸ”₯πŸ”₯[RadixAttention] Efficiently Programming Large Language Models using SGLang(@Stanford University etc)","shortMessageHtmlLink":"πŸ”₯πŸ”₯[RadixAttention] Efficiently Programming Large Language Models usin…"}},{"before":"a699eaf3c3857307dc311af0c8748b331965b0b5","after":"1c728c9f1c2dd26fa1d74080cf2c67b4155096c0","ref":"refs/heads/main","pushedAt":"2024-04-27T01:30:53.000Z","pushType":"pr_merge","commitsCount":2,"pusher":{"login":"DefTruth","name":"DefTruth","path":"/DefTruth","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/31974251?s=80&v=4"},"commit":{"message":"Merge pull request #15 from preminstrel/patch-1\n\nUpdate README.md","shortMessageHtmlLink":"Merge pull request #15 from preminstrel/patch-1"}},{"before":"7c0c1a2422b0d2e485b3a0f5f70c6a8d39342e0c","after":"a699eaf3c3857307dc311af0c8748b331965b0b5","ref":"refs/heads/main","pushedAt":"2024-04-26T08:15:32.000Z","pushType":"pr_merge","commitsCount":2,"pusher":{"login":"DefTruth","name":"DefTruth","path":"/DefTruth","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/31974251?s=80&v=4"},"commit":{"message":"Merge pull request #13 from HarryWu-CHN/kvcache-add\n\n[KVcache] add \"Gear\" paper and code of \"Keyformer\"","shortMessageHtmlLink":"Merge pull request #13 from HarryWu-CHN/kvcache-add"}}],"hasNextPage":true,"hasPreviousPage":false,"activityType":"all","actor":null,"timePeriod":"all","sort":"DESC","perPage":30,"cursor":"djE6ks8AAAAEXJuVhQA","startCursor":null,"endCursor":null}},"title":"Activity Β· DefTruth/Awesome-LLM-Inference"}