LangChain实战：构建AI应用链

引言

LangChain 是构建 LLM 应用的主流框架，提供了从简单的 Prompt 链到复杂的自主 Agent 的全套抽象。本文将系统介绍 LangChain 的核心概念——Chain、Agent、Tool、Memory 和 Output Parser，通过实际代码示例展示如何构建不同类型的 AI 应用。

LangChain 核心架构

graph TB
    A[LangChain 核心组件] --> B[Model I/O]
    A --> C[Retrieval]
    A --> D[Agents]
    A --> E[Chains / LCEL]

    B --> B1[Prompt Templates]
    B --> B2[LLM / Chat Models]
    B --> B3[Output Parsers]

    C --> C1[Document Loaders]
    C --> C2[Text Splitters]
    C --> C3[Vector Stores]
    C --> C4[Retrievers]

    D --> D1[Agent Executor]
    D --> D2[Tools]
    D --> D3[Agent Types]

    E --> E1[RunnableSequence]
    E --> E2[RunnableParallel]
    E --> E3[RunnableBranch]

    style A fill:#1c3d5a,color:#fff
    style B fill:#3498db,color:#fff
    style C fill:#2ecc71,color:#fff
    style D fill:#e74c3c,color:#fff
    style E fill:#f39c12,color:#000

LCEL — LangChain Expression Language

LCEL 是 LangChain 的核心编排方式，使用管道符 | 组合组件：

from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

# Building blocks of the basic LCEL pipeline: chat model, string parser, prompt.
model = ChatOpenAI(model="gpt-4o", temperature=0.7)
output_parser = StrOutputParser()
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "你是一个专业的{domain}专家，用简洁的中文回答问题。"),
        ("human", "{question}"),
    ]
)

# Joining components with "|" builds a RunnableSequence: prompt -> model -> parser.
chain = prompt | model | output_parser

# Single call: the dict supplies both template variables.
result = chain.invoke(dict(domain="机器学习", question="什么是梯度下降？"))
print(result)

# Streaming: print each chunk as soon as it is produced, without newlines.
for chunk in chain.stream(dict(domain="机器学习", question="什么是反向传播？")):
    print(chunk, end="", flush=True)

# Batch: several inputs in one call, with concurrency capped at 5.
results = chain.batch(
    [
        dict(domain="前端", question="什么是虚拟DOM？"),
        dict(domain="后端", question="什么是微服务？"),
    ],
    config=dict(max_concurrency=5),
)

RunnableParallel — 并行执行

from langchain_core.runnables import RunnableParallel, RunnablePassthrough

# Run multiple chains in parallel: each keyword argument names one sub-chain,
# and all three sub-chains receive the same input dict.
# NOTE(review): prompt_summary, prompt_keywords and prompt_sentiment are not
# defined anywhere in this document; presumably each is a prompt template with
# a {text} variable -- confirm before running.
analysis_chain = RunnableParallel(
    summary=prompt_summary | model | output_parser,
    keywords=prompt_keywords | model | output_parser,
    sentiment=prompt_sentiment | model | output_parser,
)

# The single "text" input is shared by every branch.
result = analysis_chain.invoke({"text": "LangChain是一个优秀的LLM应用框架..."})
# Expected shape of the result, keyed by the argument names used above:
# result = {"summary": "...", "keywords": "...", "sentiment": "..."}

RunnableBranch — 条件路由

from langchain_core.runnables import RunnableBranch

# Route to different chains based on input.
# Each (condition, chain) pair is listed in priority order; the final bare
# argument is the fallback chain.
# NOTE(review): code_chain, math_chain and general_chain are not defined in
# this document; presumably each accepts a dict with a "question" key -- confirm.
router = RunnableBranch(
    (lambda x: "代码" in x["question"], code_chain),
    (lambda x: "数学" in x["question"], math_chain),
    general_chain,  # Default fallback
)

# This question contains "代码", so it satisfies the first condition.
result = router.invoke({"question": "写一段Python代码实现快速排序"})

Prompt Templates

from langchain_core.prompts import (
    ChatPromptTemplate,
    FewShotChatMessagePromptTemplate,
    MessagesPlaceholder,
)

# Worked question/answer pairs that are shown to the model as examples.
examples = [
    dict(input="什么是Python？", output="Python是一种高级编程语言，以简洁清晰著称。"),
    dict(input="什么是Docker？", output="Docker是一个容器化平台，用于打包和运行应用。"),
]

# Each example is rendered as a human turn followed by an AI turn.
example_prompt = ChatPromptTemplate.from_messages(
    [("human", "{input}"), ("ai", "{output}")]
)
few_shot_prompt = FewShotChatMessagePromptTemplate(
    examples=examples,
    example_prompt=example_prompt,
)

# Full prompt: system instruction, then the examples, then the real question.
final_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "你是一个技术术语解释器，用一句话简洁回答。"),
        few_shot_prompt,
        ("human", "{input}"),
    ]
)

chain = final_prompt | model | output_parser
result = chain.invoke(dict(input="什么是Kubernetes？"))

Output Parsers — 结构化输出

from langchain_core.output_parsers import JsonOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field
from typing import List

# Define output schema: the structured result of a code review.
# The Field descriptions are runtime text and are left untranslated; they are
# documented with comments rather than a class docstring so that the schema
# generated from this model stays unchanged.
class CodeReview(BaseModel):
    # List of code problems that were found.
    issues: List[str] = Field(description="发现的代码问题列表")
    # Severity label; the description names low/medium/high, but the field is
    # a plain str and nothing here restricts it to those values.
    severity: str = Field(description="严重程度: low/medium/high")
    # List of improvement suggestions.
    suggestions: List[str] = Field(description="改进建议列表")
    # Code quality score; described as 0-100, with no range validation here.
    score: int = Field(description="代码质量分数 0-100")

# JSON parser whose format instructions are derived from the CodeReview model.
parser = JsonOutputParser(pydantic_object=CodeReview)

review_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "你是一个代码审查专家。分析以下代码并给出结构化的审查结果。\n{format_instructions}"),
        ("human", "请审查这段代码:\n```python\n{code}\n```"),
    ]
)

# Pre-fill {format_instructions} so callers only have to supply {code}.
review_chain = (
    review_prompt.partial(format_instructions=parser.get_format_instructions())
    | model
    | parser
)

result = review_chain.invoke(dict(code="def add(a,b): return a+b"))
# result is a dict matching CodeReview schema

使用 with_structured_output

# More robust: use model's native function calling
# NOTE(review): the class docstring and Field descriptions below are kept
# verbatim because they presumably become part of the schema sent to the
# model -- confirm against the with_structured_output documentation.
class ExtractedInfo(BaseModel):
    """Extracted information from the text."""
    # Person's name.
    name: str = Field(description="人名")
    # Age.
    age: int = Field(description="年龄")
    # Occupation.
    occupation: str = Field(description="职业")

# Bind the schema to the model, then invoke it with a plain string.
structured_llm = model.with_structured_output(ExtractedInfo)
result = structured_llm.invoke("张三今年30岁，是一名软件工程师。")
# Illustrative expected value for this input:
# result = ExtractedInfo(name='张三', age=30, occupation='软件工程师')

Memory Systems

graph LR
    A[Memory Types] --> B[ConversationBufferMemory<br/>完整历史]
    A --> C[ConversationBufferWindowMemory<br/>窗口历史]
    A --> D[ConversationSummaryMemory<br/>摘要历史]
    A --> E[ConversationTokenBufferMemory<br/>Token限制]
    A --> F[VectorStoreRetrieverMemory<br/>向量存储]

    style B fill:#3498db,color:#fff
    style D fill:#e74c3c,color:#fff
    style F fill:#2ecc71,color:#fff

from langchain_core.prompts import MessagesPlaceholder
from langchain_core.chat_history import InMemoryChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory

# Maps each session id to that session's message history.
store = {}


def get_session_history(session_id: str):
    """Return the history stored for ``session_id``, creating it on first use."""
    try:
        return store[session_id]
    except KeyError:
        # First request for this session: create an empty history and keep it.
        history = InMemoryChatMessageHistory()
        store[session_id] = history
        return history

# Prompt layout: system message, earlier turns under "history", then the new input.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "你是一个友好的AI助手。"),
        MessagesPlaceholder(variable_name="history"),
        ("human", "{input}"),
    ]
)
chain = prompt | model | output_parser

# Wrap the chain so the history comes from get_session_history; the two keys
# match the placeholder name and the input variable used in the prompt above.
chain_with_history = RunnableWithMessageHistory(
    chain,
    get_session_history,
    history_messages_key="history",
    input_messages_key="input",
)

# Both calls below pass the same session id, so they share one history object.
config = dict(configurable=dict(session_id="user_123"))

r1 = chain_with_history.invoke(dict(input="我叫张三"), config=config)
r2 = chain_with_history.invoke(dict(input="我叫什么名字？"), config=config)
# r2 will remember: "你叫张三"

Tools — 工具定义与使用

from langchain_core.tools import tool
from langchain_community.tools import DuckDuckGoSearchRun
import httpx
import ast

# Define custom tools using decorator
@tool
def calculate(expression: str) -> str:
    """计算数学表达式。输入应该是一个有效的Python数学表达式。"""
    # NOTE: the docstring above is left untranslated on purpose; the @tool
    # decorator uses a function's docstring as the tool description.
    #
    # Evaluates an arithmetic expression and returns the result as text.
    # The expression is parsed with ``ast`` and only numeric literals and the
    # operators listed below are evaluated; names, calls, attribute access and
    # everything else are rejected, so no arbitrary code can run.
    # (The previous version only compiled the expression and returned the
    # repr of the code object instead of a number.)
    #
    # Returns the result via ``str()``, or "计算错误: <reason>" on any failure.
    import operator  # local import keeps this block self-contained

    binary_ops = {
        ast.Add: operator.add,
        ast.Sub: operator.sub,
        ast.Mult: operator.mul,
        ast.Div: operator.truediv,
        ast.FloorDiv: operator.floordiv,
        ast.Mod: operator.mod,
        ast.Pow: operator.pow,
    }
    unary_ops = {ast.UAdd: operator.pos, ast.USub: operator.neg}
    # Cap exponents so an input such as 9 ** 9 ** 9 cannot exhaust CPU or memory.
    max_exponent = 1000

    def evaluate(node: ast.AST):
        """Recursively evaluate a whitelisted arithmetic AST node."""
        # type() rather than isinstance() so that True/False are not accepted.
        if isinstance(node, ast.Constant) and type(node.value) in (int, float):
            return node.value
        if isinstance(node, ast.UnaryOp) and type(node.op) in unary_ops:
            return unary_ops[type(node.op)](evaluate(node.operand))
        if isinstance(node, ast.BinOp) and type(node.op) in binary_ops:
            left = evaluate(node.left)
            right = evaluate(node.right)
            if isinstance(node.op, ast.Pow) and abs(right) > max_exponent:
                raise ValueError("exponent too large")
            return binary_ops[type(node.op)](left, right)
        raise ValueError(f"unsupported expression: {type(node).__name__}")

    try:
        tree = ast.parse(expression, mode="eval")
        return str(evaluate(tree.body))
    except Exception as e:
        # Tool boundary: report every failure (syntax error, division by
        # zero, rejected node, ...) as text so the agent can react to it.
        return f"计算错误: {e}"

@tool
def get_weather(city: str) -> str:
    """获取指定城市的当前天气信息。"""
    # NOTE: the docstring above is left untranslated on purpose; the @tool
    # decorator uses a function's docstring as the tool description.
    #
    # Queries the weatherapi.com "current" endpoint for ``city`` and returns
    # "<city>: <condition text>, 温度<temp_c>°C". Network failures, non-2xx
    # responses and unexpected payloads are returned as an error message
    # instead of raising, so a failed lookup does not abort the agent run.
    # "YOUR_API_KEY" is a placeholder and must be replaced with a real key.
    try:
        response = httpx.get(
            "https://api.weatherapi.com/v1/current.json",
            params={"key": "YOUR_API_KEY", "q": city, "lang": "zh"},
        )
        # Turn 4xx/5xx responses (bad key, unknown city, ...) into an
        # exception rather than trying to parse an error body.
        response.raise_for_status()
        current = response.json()["current"]
        return f"{city}: {current['condition']['text']}, 温度{current['temp_c']}°C"
    except httpx.HTTPError as e:
        # Covers both transport errors and the status error raised above.
        return f"天气查询失败: {e}"
    except (KeyError, TypeError, ValueError) as e:
        # Invalid JSON or a payload without the expected fields.
        return f"天气数据解析失败: {e}"

@tool
def search_docs(query: str) -> str:
    """在内部知识库中搜索相关文档。"""
    # NOTE: the docstring above is left untranslated on purpose; the @tool
    # decorator uses a function's docstring as the tool description.
    #
    # Returns the page contents of the three most similar documents, joined
    # by newlines. Uses the module-level ``vectorstore`` (the FAISS index
    # built in the retrieval section); the previous name ``vector_store`` is
    # not defined anywhere in this document. The index must exist before the
    # tool is first invoked.
    results = vectorstore.similarity_search(query, k=3)
    return "\n".join(doc.page_content for doc in results)

# Built-in tools: the DuckDuckGo search tool imported from
# langchain_community.tools, instantiated with its default settings.
search_tool = DuckDuckGoSearchRun()

Agents — 自主决策

from langchain.agents import create_tool_calling_agent, AgentExecutor

# Define tools: the three custom @tool functions plus the built-in search tool.
tools = [calculate, get_weather, search_docs, search_tool]

# Agent prompt: a system message listing the tools, an optional chat history,
# the user input, and the "agent_scratchpad" placeholder.
# NOTE(review): the scratchpad presumably receives the agent's intermediate
# tool calls and results -- confirm against the create_tool_calling_agent docs.
agent_prompt = ChatPromptTemplate.from_messages([
    ("system", """你是一个智能助手，可以使用以下工具来帮助用户：
- calculate: 进行数学计算
- get_weather: 查询天气
- search_docs: 搜索内部文档
- duckduckgo_search: 搜索互联网

根据用户的问题，决定使用哪些工具，然后综合结果回答。"""),
    MessagesPlaceholder(variable_name="chat_history", optional=True),
    ("human", "{input}"),
    MessagesPlaceholder(variable_name="agent_scratchpad"),
])

# Create agent from the model, the tool list and the prompt above.
agent = create_tool_calling_agent(model, tools, agent_prompt)

# Agent executor handles the loop
agent_executor = AgentExecutor(
    agent=agent,
    tools=tools,
    verbose=True,          # Print reasoning steps
    max_iterations=5,      # Prevent infinite loops
    handle_parsing_errors=True,
)

# Run agent with one request that needs two tools (weather and arithmetic).
# "chat_history" is omitted, which the prompt allows because it is optional.
result = agent_executor.invoke({
    "input": "北京今天天气怎么样？另外帮我算一下 (15 * 24) + 360"
})

sequenceDiagram
    participant U as 用户
    participant A as Agent
    participant L as LLM
    participant T as Tools

    U->>A: "北京天气怎么样？算一下15*24+360"
    A->>L: 分析用户意图
    L-->>A: 计划: 1.查天气 2.做计算

    A->>T: get_weather("北京")
    T-->>A: "北京: 晴, 28°C"

    A->>T: calculate("15*24+360")
    T-->>A: "720"

    A->>L: 综合工具结果生成回答
    L-->>A: 最终回答
    A->>U: "北京今天晴天28°C。15*24+360=720"

Retrieval Chains

from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

# Load and index documents
# RecursiveCharacterTextSplitter is used below but was never imported in this
# document; import it here so the snippet runs as written.
from langchain.text_splitter import RecursiveCharacterTextSplitter

loader = WebBaseLoader("https://docs.example.com/guide")
docs = loader.load()

# Split into chunks of up to 1000 characters with a 200-character overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)

# Embed the chunks into a FAISS index and expose it as a retriever that
# returns the 4 closest chunks per query.
vectorstore = FAISS.from_documents(splits, OpenAIEmbeddings())
retriever = vectorstore.as_retriever(search_kwargs={"k": 4})

# System message for the QA step; {context} is filled with the retrieved documents.
system_prompt = (
    '基于以下上下文回答用户的问题。如果不确定，说"我不知道"。\n'
    "\n"
    "上下文：\n"
    "{context}"
)

qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)

# Answer chain first, then the retrieval chain that wraps it.
question_answer_chain = create_stuff_documents_chain(model, qa_prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

# "chat_history" has no optional flag in qa_prompt, so an empty list is passed
# when there is no earlier conversation.
result = rag_chain.invoke(
    dict(input="LangChain支持哪些向量数据库?", chat_history=[])
)

print(result["answer"])
print(result["context"])  # Retrieved source documents

Custom Tools with Pydantic Schema

from langchain_core.tools import StructuredTool
from pydantic import BaseModel, Field

# Argument schema for the database query tool. The Field descriptions are
# runtime text and are left untranslated; comments are used instead of a class
# docstring so that the schema generated from this model stays unchanged.
class DatabaseQueryInput(BaseModel):
    # Name of the table to query.
    table: str = Field(description="要查询的数据表名")
    # Query condition, e.g. 'age > 25'.
    conditions: str = Field(description="查询条件，如 'age > 25'")
    # Maximum number of rows to return; defaults to 10.
    limit: int = Field(default=10, description="返回结果数量限制")

def query_database(table: str, conditions: str, limit: int = 10) -> str:
    """Simulate a database query and describe the SQL that would run.

    No database is contacted; the function only builds the SQL text and
    returns it together with a placeholder result. Because the arguments are
    chosen by a language model, ``table`` and ``limit`` are validated before
    they are interpolated into the statement.

    Args:
        table: Table name, optionally schema-qualified (``schema.table``).
            Every dot-separated part must be a valid identifier.
        conditions: SQL condition placed after ``WHERE``.
        limit: Maximum number of rows; must be a non-negative integer.

    Returns:
        ``"Query executed: <sql>\\nResults: [...]"`` on success, or a
        ``"Query rejected: ..."`` message when an argument is invalid.
    """
    if not all(part.isidentifier() for part in table.split(".")):
        return f"Query rejected: invalid table name {table!r}"
    # bool is a subclass of int, so exclude it explicitly.
    if isinstance(limit, bool) or not isinstance(limit, int) or limit < 0:
        return f"Query rejected: invalid limit {limit!r}"

    # SECURITY: ``conditions`` is interpolated as raw SQL and cannot be
    # sanitized here. A real implementation must use parameterized queries
    # (or a read-only connection with a vetted condition grammar) instead of
    # executing this string.
    query = f"SELECT * FROM {table} WHERE {conditions} LIMIT {limit}"
    return f"Query executed: {query}\nResults: [...]"

# Wrap query_database as a tool with an explicit name, description and
# argument schema. The description string is runtime text and is left as is.
db_tool = StructuredTool.from_function(
    func=query_database,
    name="database_query",
    description="查询数据库中的数据",
    args_schema=DatabaseQueryInput,
)

LangSmith 调试与追踪

import os

# Enable LangSmith tracing
os.environ["LANGCHAIN_TRACING_V2"] = "true"
# setdefault: a real key that is already exported in the environment must not
# be overwritten by this placeholder. Replace "your-api-key" only for local
# experiments; never commit a real key.
os.environ.setdefault("LANGCHAIN_API_KEY", "your-api-key")
os.environ["LANGCHAIN_PROJECT"] = "my-rag-project"

# All chain invocations are now traced
# View traces at https://smith.langchain.com

# Add custom metadata to traces.
# ``chain`` is taken to be the basic LCEL chain whose prompt uses {domain} and
# {question}; both variables must be supplied or the prompt cannot be
# formatted (the original call passed only "question").
result = chain.invoke(
    {"domain": "LangChain", "question": "什么是LangChain?"},
    config={
        "metadata": {"user_id": "user_123", "session": "abc"},
        "tags": ["production", "rag"],
    },
)

总结

LangChain 通过 LCEL 管道式编排、结构化输出、Memory 系统和 Agent 机制，提供了构建 LLM 应用的完整工具链。核心建议：

1. 优先使用 LCEL，而非 Legacy Chain 类
2. 结构化输出：使用 with_structured_output 比 OutputParser 更可靠
3. Agent 设计：要控制迭代次数、处理错误、记录日志
4. 开启 LangSmith 追踪：对调试和优化至关重要
5. 关注 LangGraph：它是下一代 Agent 编排框架