微网站需要什么技术,企业网站的seo,wordpress页面模板制作,东莞网站建设seo优化前言
最近，开源了可商用的llama2，支持长度相比llama1的2048，拓展到了4096长度，然而，相比GPT-4、Claude-2等支持的长度，llama的长度外推显得尤为重要，本文记录了三种网络开源的RoPE改进方式及相…
前言
最近，开源了可商用的llama2，支持长度相比llama1的2048，拓展到了4096长度。然而，相比GPT-4、Claude-2等支持的长度，llama的长度外推显得尤为重要。本文记录了三种网络开源的RoPE改进方式及相关源码的阅读。
关于长度外推性：https://kexue.fm/archives/9431
关于RoPE：https://kexue.fm/archives/8265
1、线性插值法
论文：EXTENDING CONTEXT WINDOW OF LARGE LANGUAGE MODELS VIA POSITION INTERPOLATION
链接：https://arxiv.org/pdf/2306.15595.pdf
思想：不进行长度外推，而是直接缩小位置索引。即：将4096的位置编码通过线性插值法压缩到2048内，这样只需在少量的4096长度的数据上继续预训练，便可达到不错的效果。 源码阅读（附注释）：
class LlamaLinearScaledRotaryEmbedding(torch.nn.Module):
    """Rotary position embedding with linearly interpolated position indices.

    Position indices are divided by ``scale`` before the rotation angles are
    computed; this is the only difference from plain RoPE.

    Args:
        dim: Rotary dimension; ``dim // 2`` inverse frequencies are created.
        max_position_embeddings: Number of positions pre-computed in the
            cos/sin caches at construction time.
        base: Base of the geometric progression of frequencies.
        scale: Divisor applied to the position indices.
        device: Device on which the inverse frequencies are created.
    """

    def __init__(self, dim, max_position_embeddings=2048, base=10000, scale=1, device=None):
        super().__init__()
        # Compared with plain RoPE, the only extra parameter is `scale`.
        self.scale = scale
        # inv_freq is the vector of base frequencies.
        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float().to(device) / dim))
        self.register_buffer("inv_freq", inv_freq)

        # Build here to make `torch.jit.trace` work.
        self.max_seq_len_cached = max_position_embeddings
        # Position tensor t of length max_seq_len_cached.
        t = torch.arange(
            self.max_seq_len_cached,
            device=self.inv_freq.device,
            dtype=self.inv_freq.dtype,
        )
        # Rescale the positions (plain RoPE does not have this step).
        t /= self.scale
        # Outer product: inputs of shape [i] and [j] give a matrix [i, j].
        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
        # Different from paper, but it uses a different permutation in order
        # to obtain the same calculation.
        # Duplicate freqs along the last dimension.
        emb = torch.cat((freqs, freqs), dim=-1)
        dtype = torch.get_default_dtype()
        # Cache cos/sin as non-persistent buffers.
        self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
        self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)

    def forward(self, x, seq_len=None):
        """Return the ``(cos, sin)`` caches cut to ``seq_len`` positions.

        Args:
            x: Tensor whose dtype (and device, when the cache is rebuilt) the
                returned tensors follow.
            seq_len: Number of positions to return. When ``None`` it falls
                back to ``x.shape[-2]``.

        Returns:
            Tuple ``(cos, sin)``, each of shape ``[1, 1, seq_len, dim]`` and
            dtype ``x.dtype``.
        """
        # x: [bs, num_attention_heads, seq_len, head_size]
        if seq_len is None:
            # The declared default would otherwise fail in the comparison
            # below; take the length from x, assuming the layout noted above.
            seq_len = x.shape[-2]
        # This `if` block is unlikely to be run after we build sin/cos in
        # `__init__`. Keep the logic here just in case.
        # When seq_len exceeds the cached length, recompute the frequency
        # matrix and refresh the cos/sin buffers.
        if seq_len > self.max_seq_len_cached:
            self.max_seq_len_cached = seq_len
            t = torch.arange(self.max_seq_len_cached, device=x.device, dtype=self.inv_freq.dtype)
            t /= self.scale
            freqs = torch.einsum("i,j->ij", t, self.inv_freq)
            # Different from paper, but it uses a different permutation in
            # order to obtain the same calculation.
            emb = torch.cat((freqs, freqs), dim=-1).to(x.device)
            self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(x.dtype), persistent=False)
            self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(x.dtype), persistent=False)
        # Cut the caches down to the requested sequence length.
        return (
            self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
            self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
        )
线性插值法的相关实验效果：https://lmsys.org/blog/2023-06-29-longchat/
2、NTK插值法
NTK插值改进了llama中使用的RoPE插值方法，同样对于RoPE代码改动更小，其他地方与线性插值法实现一致。
reddit原帖：NTK-Aware Scaled RoPE allows LLaMA models to have extended (8k+) context size without any fine-tuning and minimal perplexity degradation
链接：https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/?rdt=58346
源码阅读：
class LlamaNTKScaledRotaryEmbedding(torch.nn.Module):
    """Rotary position embedding with an NTK-style rescaled frequency base.

    The position indices are left untouched; ``alpha`` only changes the base:
    ``base * alpha ** (dim / (dim - 2))``.

    Args:
        dim: Rotary dimension; ``dim // 2`` inverse frequencies are created.
            The base formula divides by ``dim - 2``.
        max_position_embeddings: Number of positions pre-computed in the
            cos/sin caches at construction time.
        base: Base of the geometric progression before rescaling.
        alpha: Scaling factor applied to ``base``.
        device: Device on which the inverse frequencies are created.
    """

    def __init__(self, dim, max_position_embeddings=2048, base=10000, alpha=1, device=None):
        super().__init__()
        # Simpler than linear interpolation: alpha is only used to change base.
        base = base * alpha ** (dim / (dim - 2))
        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float().to(device) / dim))
        self.register_buffer("inv_freq", inv_freq)

        # Build here to make `torch.jit.trace` work.
        self.max_seq_len_cached = max_position_embeddings
        t = torch.arange(
            self.max_seq_len_cached,
            device=self.inv_freq.device,
            dtype=self.inv_freq.dtype,
        )
        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
        # Different from paper, but it uses a different permutation in order
        # to obtain the same calculation.
        emb = torch.cat((freqs, freqs), dim=-1)
        dtype = torch.get_default_dtype()
        self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
        self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)

    def forward(self, x, seq_len=None):
        """Return the ``(cos, sin)`` caches cut to ``seq_len`` positions.

        Args:
            x: Tensor whose dtype (and device, when the cache is rebuilt) the
                returned tensors follow.
            seq_len: Number of positions to return. When ``None`` it falls
                back to ``x.shape[-2]``.

        Returns:
            Tuple ``(cos, sin)``, each of shape ``[1, 1, seq_len, dim]`` and
            dtype ``x.dtype``.
        """
        # x: [bs, num_attention_heads, seq_len, head_size]
        if seq_len is None:
            # The declared default would otherwise fail in the comparison
            # below; take the length from x, assuming the layout noted above.
            seq_len = x.shape[-2]
        # This `if` block is unlikely to be run after we build sin/cos in
        # `__init__`. Keep the logic here just in case.
        if seq_len > self.max_seq_len_cached:
            self.max_seq_len_cached = seq_len
            t = torch.arange(self.max_seq_len_cached, device=x.device, dtype=self.inv_freq.dtype)
            freqs = torch.einsum("i,j->ij", t, self.inv_freq)
            # Different from paper, but it uses a different permutation in
            # order to obtain the same calculation.
            emb = torch.cat((freqs, freqs), dim=-1).to(x.device)
            self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(x.dtype), persistent=False)
            self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(x.dtype), persistent=False)
        return (
            self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
            self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
        )


# 3、动态插值法 (article section 3: dynamic interpolation)
动态插值法又是对NTK插值法和线性插值法的改进，可以看作是上述两者的一种结合思想，旨在减少困惑度损失并实现更大的缩放。
reddit原帖：Dynamically Scaled RoPE further increases performance of long context LLaMA with zero fine-tuning
链接：https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/
源码阅读：
class LlamaDynamicScaledRotaryEmbedding(torch.nn.Module):
    """Rotary position embedding that rescales itself once the cache is outgrown.

    The caches are built unscaled for ``max_position_embeddings`` positions.
    When ``forward`` is asked for more positions than are cached, the caches
    are rebuilt: with a truthy ``ntk`` the frequency base is enlarged, otherwise
    the position indices are multiplied by
    ``max_position_embeddings / seq_len``.

    Args:
        dim: Rotary dimension; ``dim // 2`` inverse frequencies are created.
            The NTK base formula divides by ``dim - 2``.
        max_position_embeddings: Number of positions pre-computed at
            construction time, and the reference length for rescaling.
        base: Base of the geometric progression of frequencies.
        ntk: Falsy selects position rescaling; a truthy value selects base
            rescaling and is also used as a numeric factor in that formula.
        device: Device on which the inverse frequencies are created.
    """

    def __init__(self, dim, max_position_embeddings=2048, base=10000, ntk=False, device=None):
        super().__init__()
        # Whether NTK (Neural Tangent Kernel) style scaling is enabled.
        self.ntk = ntk
        self.base = base
        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        # inv_freq is the vector of base frequencies.
        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float().to(device) / dim))
        self.register_buffer("inv_freq", inv_freq)

        # Build here to make `torch.jit.trace` work.
        self.max_seq_len_cached = max_position_embeddings
        t = torch.arange(
            self.max_seq_len_cached,
            device=self.inv_freq.device,
            dtype=self.inv_freq.dtype,
        )
        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
        # Different from paper, but it uses a different permutation in order
        # to obtain the same calculation.
        # emb: [max_seq_len_cached, dim]
        emb = torch.cat((freqs, freqs), dim=-1)
        dtype = torch.get_default_dtype()
        self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
        self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)

    def forward(self, x, seq_len=None):
        """Return the ``(cos, sin)`` caches cut to ``seq_len`` positions.

        Args:
            x: Tensor whose dtype (and device, when the cache is rebuilt) the
                returned tensors follow.
            seq_len: Number of positions to return. When ``None`` it falls
                back to ``x.shape[-2]``.

        Returns:
            Tuple ``(cos, sin)``, each of shape ``[1, 1, seq_len, dim]`` and
            dtype ``x.dtype``.
        """
        # x: [bs, num_attention_heads, seq_len, head_size]
        if seq_len is None:
            # The declared default would otherwise fail in the comparison
            # below; take the length from x, assuming the layout noted above.
            seq_len = x.shape[-2]
        # This `if` block is unlikely to be run after we build sin/cos in
        # `__init__`. Keep the logic here just in case.
        if seq_len > self.max_seq_len_cached:
            self.max_seq_len_cached = seq_len
            if self.ntk:
                base = self.base * (
                    (self.ntk * seq_len / self.max_position_embeddings) - (self.ntk - 1)
                ) ** (self.dim / (self.dim - 2))
                # Compute the new inv_freq from the enlarged base.
                inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(x.device) / self.dim))
                self.register_buffer("inv_freq", inv_freq)
            t = torch.arange(self.max_seq_len_cached, device=x.device, dtype=self.inv_freq.dtype)
            if not self.ntk:
                # Rescale the positions.
                t *= self.max_position_embeddings / seq_len
            # New frequency matrix.
            freqs = torch.einsum("i,j->ij", t, self.inv_freq)
            # Different from paper, but it uses a different permutation in
            # order to obtain the same calculation.
            # Concatenate freqs with itself to get the new emb.
            emb = torch.cat((freqs, freqs), dim=-1).to(x.device)
            # Refresh the cos/sin buffers.
            self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(x.dtype), persistent=False)
            self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(x.dtype), persistent=False)
        # Cut the caches down to the requested sequence length.
        return (
            self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
            self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
        )


# 网友对于困惑度的实验，并取得了一定的效果 (community perplexity experiments with promising results): https://github.com/turboderp/exllama/pull/118
总结
本文介绍了llama通过线性插值法及相关改进方案进行长度外推的trick，并对相关源码阅读及网络资源进行记录。个人粗浅认为，相比LongLLaMA，基于线性插值法Finetune的方式是一种高性价比的长度外推方案。