import bs4
from langchain_community.document_loaders import WebBaseLoader

# Load a single blog post (a cat-care article from soulteary.com; used here
# purely as a learning example — do not misuse).
loader = WebBaseLoader(
    web_paths=("https://soulteary.com/2018/07/23/experience-in-breeding-cat.html",),
    bs_kwargs=dict(
        # Filter the HTML: only parse elements whose class is "post-container"
        # (presumably the article body — confirm against the page markup).
        parse_only=bs4.SoupStrainer(class_="post-container")
    ),
)
# Fetch and parse the page. The original snippet referenced `docs` later
# (text_splitter.split_documents(docs)) without ever defining it; this is
# the missing loading step.
docs = loader.load()
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Split the loaded documents into overlapping character chunks for indexing.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,  # chunk size (characters)
    chunk_overlap=200,  # chunk overlap (characters)
    add_start_index=True,  # track index in original document
)
# NOTE(review): `docs` is not defined in this snippet — it should be the
# result of `loader.load()`; confirm it is produced earlier in the file.
all_splits = text_splitter.split_documents(docs)
print(len(all_splits))
chunk_size 表示每个 chunk 最多包含多少个字符;chunk_overlap 表示相邻 chunk 之间重叠的字符数。
# Index the split chunks into the vector store.
# `vector_store` is assumed to be configured elsewhere in the file.
_ = vector_store.add_documents(documents=all_splits)
print("document indexed")
from langchain import hub

# Pull a community-maintained RAG prompt template from the LangChain Hub.
prompt = hub.pull("rlm/rag-prompt")

# Render the prompt with placeholder values to inspect its structure.
example_messages = prompt.invoke(
    {"context": "(context goes here)", "question": "(question goes here)"}
).to_messages()
print(example_messages[0].content)
You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
Question: (question goes here)
Context: (context goes here)
Answer:
我们可以定义一次应用程序逻辑并自动支持多种调用模式,包括stream、async和batch调用。
我们可以通过 LangGraph 平台简化部署。
我们只需进行极少的代码更改,就可以轻松地为我们的应用程序添加关键功能,包括持久化和人机交互循环流程。
定义我们 application 的 state;定义我们 application 的 nodes;构建我们 application 的控制流。
from langchain_core.documents import Document
from typing_extensions import List, TypedDict
class State(TypedDict):
    """State shared by the RAG graph's steps."""

    question: str  # the user's raw question
    context: List[Document]  # documents retrieved for the question
    answer: str  # the generated answer
def retrieve(state: State):
    """Look up documents relevant to the current question via similarity search."""
    return {"context": vector_store.similarity_search(state["question"])}
def generate(state: State):
    """Answer the question using the retrieved documents as context."""
    # Concatenate the retrieved chunks into a single context string.
    context_text = "\n\n".join(
        document.page_content for document in state["context"]
    )
    prompt_value = prompt.invoke(
        {"question": state["question"], "context": context_text}
    )
    response = llm.invoke(prompt_value)
    return {"answer": response.content}
from langgraph.graph import START, StateGraph

# Wire the steps into a linear graph: retrieve -> generate.
graph_builder = StateGraph(State).add_sequence([retrieve, generate])
# Execution enters the graph at the "retrieve" node.
graph_builder.add_edge(START, "retrieve")
graph = graph_builder.compile()

# Ask a question (Chinese: "What causes rapid breathing?") and print the answer.
response = graph.invoke({"question": "呼吸过快是什么原因?"})
print(response["answer"])
from langchain_core.prompts import PromptTemplate

# A hand-written alternative to the hub prompt, with a fixed sign-off line.
# {context} and {question} are filled in at invoke time.
template = """Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Use three sentences maximum and keep the answer as concise as possible.
Always say "thanks for asking!" at the end of the answer.
{context}
Question: {question}
Helpful Answer:"""
custom_rag_prompt = PromptTemplate.from_template(template)
除了语义搜索之外,我们还可以内置结构化过滤器(例如“查找 2020 年以来的文档”);该模型还可以将用户查询(可能是多方面的,或包含不相关的语言)重写为更有效的搜索查询。
# Tag each chunk with which third of the corpus it came from, so searches
# can later be filtered by section ("beginning" / "middle" / "end").
total_documents = len(all_splits)
third = total_documents // 3
for i, document in enumerate(all_splits):
    if i < third:
        section = "beginning"
    elif i < 2 * third:
        section = "middle"
    else:
        section = "end"
    document.metadata["section"] = section

# Re-index the chunks now that their metadata carries the section tag.
_ = vector_store.add_documents(documents=all_splits)
from typing import Literal
from typing_extensions import Annotated
class Search(TypedDict):
    """Search query."""

    # Free-text query to run against the vector store; the trailing string in
    # each Annotated is the field description exposed to the structured-output
    # schema, so it must stay exactly as written.
    query: Annotated[str, ..., "Search query to run."]
    # Which third of the corpus to restrict the search to.
    section: Annotated[
        Literal["beginning", "middle", "end"],
        ...,
        "Section to query.",
    ]
# Application state: redefines the earlier State, adding the structured
# `query` produced by the analyze_query step.
class State(TypedDict):
    question: str  # raw user question
    query: Search  # structured query derived from the question
    context: List[Document]  # documents retrieved for the query
    answer: str  # generated answer
# Application steps.
def analyze_query(state: State):
    """Turn the raw question into a structured Search query."""
    # with_structured_output constrains the model's reply to the Search schema.
    structured_llm = llm_model.with_structured_output(Search,method="json_schema")
    return {"query": structured_llm.invoke(state["question"])}
def retrieve(state: State):
    """Run a section-filtered similarity search for the analyzed query."""
    search = state["query"]
    # Restrict matches to the chunk section chosen by analyze_query.
    matches = vector_store.similarity_search(
        search["query"],
        filter={"section": {"$eq": search["section"]}},
    )
    return {"context": matches}
def generate(state: State):
    """Produce the final answer from the retrieved context."""
    joined_context = "\n\n".join(d.page_content for d in state["context"])
    prompt_value = prompt.invoke(
        {"question": state["question"], "context": joined_context}
    )
    return {"answer": llm_model.invoke(prompt_value).content}
# Build the state graph: chain the three steps
# analyze_query -> retrieve -> generate in sequence.
graph_builder = StateGraph(State).add_sequence([analyze_query,retrieve, generate])
# Entry edge: execution starts at the analyze_query node.
graph_builder.add_edge(START, "analyze_query")
# Compile the graph into a runnable application.
graph = graph_builder.compile()
总结一下,我们介绍了基于数据构建基本问答应用程序的步骤:
- 使用文档加载器加载数据
- 使用文本分割器对数据进行分块以便建立索引,使模型更容易使用
- 嵌入数据并将数据存储在矢量存储中
- 检索先前存储的块以响应传入的问题
- 使用检索到的块作为上下文来生成答案。