RAG and RAU: A Survey on Retrieval-Augmented Language Models in Natural Language Processing


Large language models (LLMs) have driven remarkable progress in natural language processing (NLP), but they still face challenges such as hallucination and the need for domain-specific knowledge.

To mitigate these problems, recent approaches integrate information from external resources into LLMs, substantially improving their performance on NLP tasks. This survey addresses the lack of a comprehensive overview of retrieval-augmented language models (RALMs), covering both retrieval-augmented generation (RAG) and retrieval-augmented understanding (RAU), and examines their paradigms, evolution, taxonomy, and applications. The paper discusses the essential components of RALMs, including the retriever, the language model, and the augmentation, and shows how their interactions give rise to different model structures and applications.

RALMs have demonstrated utility across a wide range of tasks, from translation and dialogue systems to knowledge-intensive applications. The survey covers several evaluation methods for RALMs, emphasizing the importance of robustness, accuracy, and relevance in evaluation. It also acknowledges the limitations of RALMs, particularly in retrieval quality and computational efficiency, and offers directions for future research.

In summary, this survey aims to provide a structured view of RALMs, their potential, and avenues for their future development in NLP.

The paper is accompanied by a GitHub repository containing the surveyed works and resources for further study: https://github.com/2471023025/RALM_Survey.

This survey summarizes many aspects of RALMs, including their definition, retrievers, language models, augmentations, data sources, applications, and evaluation.

A retrieval-augmented language model (RALM) refines the output of a language model (LM) with retrieved information, so that the result better satisfies the user.
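As a minimal, self-contained sketch of this retrieve-then-read loop (the word-overlap retriever and the generate callback below are illustrative assumptions, not a design taken from the survey):

# Toy retrieve-then-read pipeline illustrating the RALM idea.
# The overlap scorer and `generate` are hypothetical placeholders.
def retrieve(query: str, corpus: list[str], top_k: int = 3) -> list[str]:
    # Rank documents by word overlap with the query (a stand-in for a real retriever).
    query_words = set(query.split())
    ranked = sorted(corpus, key=lambda doc: len(query_words & set(doc.split())), reverse=True)
    return ranked[:top_k]

def rag_answer(query: str, corpus: list[str], generate) -> str:
    # Augment the prompt with retrieved evidence before calling the language model.
    context = "\n".join(retrieve(query, corpus))
    prompt = f"Context:\n{context}\n\nQuestion: {query}\nAnswer:"
    return generate(prompt)  # `generate` is any LM callable: prompt in, answer out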


Slot filling is a technique used in natural language processing to identify and extract specific pieces of information from user-provided text or speech. In slot filling, the system predefines a set of slots, each representing a specific information need.
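For instance (an illustrative sketch: the slot names, BIO tags, and character-level tokenization are assumptions for demonstration, not taken from the dataset used later), a travel-query skill might predefine origin and destination slots and fill them from a tagged token sequence:

# Recover slot values from a BIO-tagged token sequence (illustrative only).
tokens = ["查", "赣", "州", "到", "厦", "门", "的", "车"]
tags = ["O", "B-origin", "I-origin", "O", "B-destination", "I-destination", "O", "O"]

def decode_slots(tokens, tags):
    slots, name, value = {}, None, []
    for token, tag in zip(tokens, tags):
        if tag.startswith("B-"):                      # a new slot value starts here
            if name:
                slots[name] = "".join(value)
            name, value = tag[2:], [token]
        elif tag.startswith("I-") and name == tag[2:]:
            value.append(token)                       # the current slot value continues
        else:                                         # an "O" tag closes any open slot
            if name:
                slots[name] = "".join(value)
            name, value = None, []
    if name:
        slots[name] = "".join(value)
    return slots

print(decode_slots(tokens, tags))  # {'origin': '赣州', 'destination': '厦门'}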

Improving retrieval quality can be approached from two angles: improving the quality of the datasets used for retrieval, and improving the performance of the retrieval techniques themselves. Today, many datasets are fed to LLMs to generate relevant content, and because LLMs themselves hallucinate, measures must be taken to ensure data accuracy, such as refinement under human supervision.

In intelligent dialogue tasks, intent recognition is a crucial technique: it helps the system understand the user's input and thus provide more accurate and personalized answers and services.

Intent recognition and slot filling are fundamental tasks in dialogue systems. The repository below implements a BERT-based module for joint intent and slot prediction.

The idea is essentially the same as JointBERT: the last hidden state of the [CLS] token is used to predict the intent of the whole sentence, while the last hidden states of the sentence tokens are used for sequence labeling to locate the tokens that contain slot values.
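The core of that design can be sketched in a few lines (a simplified sketch assuming Hugging Face transformers; JointBertSketch is a hypothetical stand-in, and the repo's actual JointBert in models.py additionally computes the joint loss from the intent and slot labels):

import torch.nn as nn
from transformers import BertModel

class JointBertSketch(nn.Module):
    """Simplified joint model: [CLS] state -> intent, token states -> slot tags."""
    def __init__(self, model_path, intent_label_num, slot_label_num):
        super().__init__()
        self.bert = BertModel.from_pretrained(model_path)
        hidden = self.bert.config.hidden_size
        self.intent_classifier = nn.Linear(hidden, intent_label_num)
        self.slot_classifier = nn.Linear(hidden, slot_label_num)

    def forward(self, input_ids, attention_mask=None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = outputs.last_hidden_state                   # (batch, seq_len, hidden)
        intent_logits = self.intent_classifier(sequence_output[:, 0])  # [CLS] -> sentence intent
        slot_logits = self.slot_classifier(sequence_output)            # per-token slot tag logits
        return {"intent_logits": intent_logits, "slot_logits": slot_logits}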

You can define your own intent and slot labels, provide your own data, train your own model with the pipeline below, and then load the trained model in the JointIntentSlotDetector class to predict intents and slot values directly.
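For reference, one training sample and the two label files might look like the sketch below (the field names, intents, and slot tags here are guesses inferred from the script's defaults; check the repo's data/SMP2019 directory for the authoritative format):

# Hypothetical shape of one sample in data/SMP2019/data.json:
sample = {
    "text": "帮我查一下赣州到厦门的汽车",
    "intent": "QUERY",
    "slots": {"origin": "赣州", "destination": "厦门"},
}
# intent_labels.txt lists one intent name per line, e.g. QUERY, SEND_MESSAGE;
# slot_labels.txt lists one BIO tag per line, e.g. [PAD], O, B-origin, I-origin.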

Training script: python train.py

import os
import argparse

import numpy as np
import torch
from torch.utils.data import DataLoader
# sklearn's accuracy_score scores flat intent labels; seqeval's variant scores
# tagged sequences, so it is aliased to avoid shadowing sklearn's.
from sklearn.metrics import accuracy_score
from seqeval.metrics import accuracy_score as seq_accuracy_score
from transformers import BertTokenizer, get_linear_schedule_with_warmup

from datasets import IntentSlotDataset
from models import JointBert
from tools import save_module, split_data

def dev(model, val_dataloader, device, slot_dict):
    """Evaluate intent accuracy and slot accuracy on the validation set."""
    model.eval()
    intent_acc, slot_acc = 0, 0
    all_true_intent, all_pred_intent = [], []
    with torch.no_grad():
        for step, batch in enumerate(val_dataloader):
            input_ids, intent_labels, slot_labels = batch
            outputs = model(
                input_ids=torch.tensor(input_ids).long().to(device),
                intent_labels=torch.tensor(intent_labels).long().to(device),
                slot_labels=torch.tensor(slot_labels).long().to(device)
            )
            # Turn logits into predicted label ids.
            intent_probs = torch.softmax(outputs["intent_logits"], dim=-1).detach().cpu().numpy()
            slot_probs = torch.softmax(outputs["slot_logits"], dim=-1).detach().cpu().numpy()
            slot_ids = np.argmax(slot_probs, axis=-1).tolist()
            intent_ids = np.argmax(intent_probs, axis=-1).tolist()
            # Map slot ids to tag strings, since seqeval expects label sequences.
            slot_ids = [[slot_dict[i] for i in line] for line in slot_ids]
            slot_labels = [[slot_dict[i] for i in line] for line in slot_labels]
            all_true_intent.extend(intent_labels)
            all_pred_intent.extend(intent_ids)
            intent_acc += accuracy_score(intent_labels, intent_ids)
            slot_acc += seq_accuracy_score(slot_labels, slot_ids)
    # Average the per-batch accuracies; their sum is used for model selection.
    intent_avg, slot_avg = intent_acc / len(val_dataloader), slot_acc / len(val_dataloader)
    dev_acc = intent_avg + slot_avg
    return dev_acc, intent_avg, slot_avg

def train(args):
    # Directory where the fine-tuned model will be saved.
    model_save_dir = args.save_dir + "/" + args.model_path.split("/")[-1]

    with open(args.slot_label_path, 'r') as f:
        slot_labels = f.read().strip('\n').split('\n')
    slot_dict = dict(zip(range(len(slot_labels)), slot_labels))

    # -----------set cuda environment-------------
    os.environ["CUDA_VISIBLE_DEVICES"] = args.cuda_devices
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # -----------load tokenizer-----------
    tokenizer = BertTokenizer.from_pretrained(args.model_path)
    save_module(tokenizer, model_save_dir)

    # -----------load data-----------------
    train_data, val_data = split_data(args.train_data_path, args.train_val_data_split)
    train_dataset = IntentSlotDataset.load_from_path(
        data_content=train_data,
        intent_label_path=args.intent_label_path,
        slot_label_path=args.slot_label_path,
        tokenizer=tokenizer
    )
    val_dataset = IntentSlotDataset.load_from_path(
        data_content=val_data,
        intent_label_path=args.intent_label_path,
        slot_label_path=args.slot_label_path,
        tokenizer=tokenizer
    )

    # -----------load model and dataset-----------
    model = JointBert.from_pretrained(
        args.model_path,
        slot_label_num=train_dataset.slot_label_num,
        intent_label_num=train_dataset.intent_label_num
    )
    model = model.to(device).train()

    train_dataloader = DataLoader(
        train_dataset,
        shuffle=True,
        batch_size=args.batch_size,
        collate_fn=train_dataset.batch_collate_fn)
    val_dataloader = DataLoader(
        val_dataset,
        shuffle=True,
        batch_size=args.batch_size,
        collate_fn=val_dataset.batch_collate_fn)

    # -----------calculate training steps-----------
    if args.max_training_steps > 0:
        total_steps = args.max_training_steps
    else:
        total_steps = len(train_dataset) * args.train_epochs // args.gradient_accumulation_steps // args.batch_size
    print('calculated total optimizer update steps : {}'.format(total_steps))

    # -----------prepare optimizer and schedule------------
    parameter_names_no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        # These parameters get regular weight decay (set by args.weight_decay).
        {'params': [
            para for para_name, para in model.named_parameters()
            if not any(nd_name in para_name for nd_name in parameter_names_no_decay)
        ], 'weight_decay': args.weight_decay},
        # Weight decay is disabled for biases and LayerNorm weights.
        {'params': [
            para for para_name, para in model.named_parameters()
            if any(nd_name in para_name for nd_name in parameter_names_no_decay)
        ], 'weight_decay': 0.0}
    ]
    optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    # Learning rate schedule: linear warmup followed by linear decay.
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=total_steps)

    # -----------training-------------
    max_acc = 0
    for epoch in range(args.train_epochs):
        total_loss = 0
        model.train()
        for step, batch in enumerate(train_dataloader):
            input_ids, intent_labels, slot_labels = batch
            outputs = model(
                input_ids=torch.tensor(input_ids).long().to(device),
                intent_labels=torch.tensor(intent_labels).long().to(device),
                slot_labels=torch.tensor(slot_labels).long().to(device)
            )
            loss = outputs['loss']
            total_loss += loss.item()
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            loss.backward()
            if step % args.gradient_accumulation_steps == 0:
                # Clip gradients to prevent exploding gradients during training.
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()
                model.zero_grad()

        train_loss = total_loss / len(train_dataloader)
        dev_acc, intent_avg, slot_avg = dev(model, val_dataloader, device, slot_dict)
        # Keep the checkpoint with the best combined dev accuracy.
        flag = False
        if max_acc < dev_acc:
            max_acc = dev_acc
            flag = True
            save_module(model, model_save_dir)
        print(f"[{epoch}/{args.train_epochs}] train loss: {train_loss} dev intent_avg: {intent_avg} "
              f"dev slot_avg: {slot_avg} save best model: {'*' if flag else ''}")

    dev_acc, intent_avg, slot_avg = dev(model, val_dataloader, device, slot_dict)
    print("last model dev intent_avg: {} dev slot_avg: {}".format(intent_avg, slot_avg))
    print("model saved to: " + model_save_dir)

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    # environment parameters
    parser.add_argument("--cuda_devices", type=str, default='0', help='set cuda device numbers')
    # model parameters
    parser.add_argument("--model_path", type=str, default='./bert-base-chinese', help="pretrained model loading path")
    # data parameters
    parser.add_argument("--train_data_path", type=str, default='data/SMP2019/data.json', help="training data path")
    parser.add_argument("--train_val_data_split", type=float, default=0.8, help="train/val data split ratio")
    parser.add_argument("--slot_label_path", type=str, default='data/SMP2019/slot_labels.txt', help="slot label path")
    parser.add_argument("--intent_label_path", type=str, default='data/SMP2019/intent_labels.txt', help="intent label path")
    # training parameters
    parser.add_argument("--save_dir", type=str, default='./save_model', help="directory to save the model")
    parser.add_argument("--max_training_steps", type=int, default=0, help='max training steps for the optimizer; used if larger than 0')
    parser.add_argument("--gradient_accumulation_steps", type=int, default=1, help="number of steps to accumulate gradients over before performing an optimizer update")
    parser.add_argument("--batch_size", type=int, default=32, help='training data batch size')
    parser.add_argument("--train_epochs", type=int, default=20, help='training epoch number')
    parser.add_argument("--learning_rate", type=float, default=5e-5, help='learning rate')
    parser.add_argument("--adam_epsilon", type=float, default=1e-8, help="epsilon for Adam optimizer")
    parser.add_argument("--warmup_steps", type=int, default=0, help="warmup step number")
    parser.add_argument("--weight_decay", type=float, default=0.0, help="weight decay rate")
    parser.add_argument("--max_grad_norm", type=float, default=1.0, help="maximum norm for gradients")

    args = parser.parse_args()
    train(args)

Inference script: python predict.py

import time

from detector import JointIntentSlotDetector

# Load the fine-tuned model and tokenizer.
start1_time = time.perf_counter()
model = JointIntentSlotDetector.from_pretrained(
    model_path='./save_model/bert-base-chinese',
    tokenizer_path='./save_model/bert-base-chinese',
    intent_label_path='./data/SMP2019/intent_labels.txt',
    slot_label_path='./data/SMP2019/slot_labels.txt'
)
start2_time = time.perf_counter()

all_text = [
    '定位我现在的位置',
    "现在几点了",
    "2013年亚洲冠军联赛恒广州恒大比赛时间。",
    "帮我查一下赣州到厦门的汽车",
    "导航到望江西路上去",
    "把张玉娟的手机号码发送给吴伟",
    "打电话给xxx",
    "经XXX的电话号码发给lc",
    "发信息给盛吉",
    "将你在哪发送给纲吉",
    "发信息给老妈说我在吃饭",
    "我要听稻香",
    "访问浏览器",
    "中国制用英文怎么说",
]
for text in all_text:
    print(model.detect(text))
end_time = time.perf_counter()

# perf_counter() returns seconds, so the elapsed times are reported directly.
time1 = end_time - start1_time
time2 = end_time - start2_time
print("total detection time (including model loading):", time1, "s",
      "excluding model loading:", time2, "s",
      "number of predictions:", len(all_text),
      "average time per prediction (excluding loading):", time2 / len(all_text), "s/item")

Source code: https://github.com/mzc421/NLP/tree/main/bert-intent-slot
