.01
数据加载:从CSV文件中加载客户评论,并进行文本分割。 创建向量存储:使用嵌入模型(如OpenAIEmbeddings)将文本向量化,并存入向量数据库(如Pinecone)。 创建Retriever:将向量存储转换为Retriever,可直接进行相似度搜索,也可设置特定的检索参数。
from langchain_community.document_loaders import CSVLoader
from langchain_community.vectorstores import Pinecone
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import CharacterTextSplitter
# Load the customer reviews from the CSV file.
loader = CSVLoader("customer_reviews.csv")
documents = loader.load()

# Split the loaded documents (chunk_size=500, chunk_overlap=50).
text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
texts = text_splitter.split_documents(documents)

# Embed the chunks and store them in Pinecone.
# index_name must be given: without it the LangChain Pinecone wrapper cannot
# resolve a target index and raises. "customer-reviews" is a placeholder --
# replace it with the name of an index that exists in your Pinecone project.
# NOTE(review): presumably OpenAI / Pinecone credentials are read from the
# environment -- confirm before running.
embeddings = OpenAIEmbeddings()
vectorstore = Pinecone.from_documents(
    texts, embeddings, index_name="customer-reviews"
)

# Expose the vector store as a retriever (default settings) and query it.
retriever = vectorstore.as_retriever()
docs = retriever.invoke("What do customers think about the battery life?")
.03
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_openai import ChatOpenAI

# The user question to retrieve documents for.
question = "What features do customers value in smartphones?"
llm = ChatOpenAI(temperature=0)

# Build the MultiQueryRetriever on top of the vector store created earlier
# in this file (`vectorstore`); the previous name `vectordb` was never
# defined and raised NameError.
retriever_from_llm = MultiQueryRetriever.from_llm(
    retriever=vectorstore.as_retriever(), llm=llm
)

# Run the retrieval once. (The snippet used to be pasted twice verbatim,
# which repeated the LLM and vector-store calls for no benefit.)
unique_docs = retriever_from_llm.invoke(question)
.04
基础检索:先使用基础的向量存储检索器进行初步查询。 压缩检索:结合大型语言模型进一步提取与查询最相关的信息,过滤掉无关部分。
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain_openai import OpenAI
# Create the LLM (temperature=0) and the extractor-based compressor built
# from it.
llm = OpenAI(temperature=0)
compressor = LLMChainExtractor.from_llm(llm=llm)

# Combine the existing base retriever with the compressor.
compression_retriever = ContextualCompressionRetriever(
    base_retriever=retriever,
    base_compressor=compressor,
)

# NOTE(review): this query is about climate change while the data loaded
# earlier in the file is customer reviews -- confirm the intended query.
compressed_docs = compression_retriever.invoke(
    "What actions are being proposed to combat climate change?"
)
.05
from typing import List
from langchain_core.documents import Document
from langchain_core.retrievers import BaseRetriever
class CustomRetriever(BaseRetriever):
    """Simple retriever that returns documents whose text contains the query.

    Matching is a case-insensitive substring test of the query against each
    document's ``page_content``. Matches keep the order of ``documents`` and
    the result is sliced to ``[:k]``.
    """

    # Corpus that is scanned on every query.
    documents: List[Document]
    # Upper bound used when slicing the list of matches.
    k: int

    def _get_relevant_documents(self, query: str) -> List[Document]:
        """Return the first ``k`` documents whose content contains ``query``."""
        # Lowercase the query once instead of once per document.
        needle = query.lower()
        hits = []
        for candidate in self.documents:
            if needle in candidate.page_content.lower():
                hits.append(candidate)
        return hits[: self.k]
# Example usage.
# Document accepts only page_content positionally; metadata has to be passed
# by keyword, otherwise the constructor raises TypeError.
documents = [
    Document(
        page_content="Dogs are great companions.",
        metadata={"type": "dog"},
    ),
    Document(
        page_content="Cats are independent pets.",
        metadata={"type": "cat"},
    ),
]
retriever = CustomRetriever(documents=documents, k=1)
result = retriever.invoke("dog")
print(result[0].page_content)  # Output: "Dogs are great companions."
.06
参考:
https://python.langchain.com/v0.1/docs/modules/data_connection/retrievers/