结构化输出(Structured Outputs)
结构化输出是大语言模型的一项重要功能,允许模型生成符合特定格式和模式的结构化数据,如 JSON、XML 等格式,便于程序化处理和集成。
概述
结构化输出的优势:
- 数据格式一致性
- 易于程序解析
- 减少后处理工作
- 提高数据质量
- 支持复杂嵌套结构
支持的格式
JSON 格式
- 对象和数组
- 基本数据类型
- 嵌套结构
- 枚举值
其他格式
- XML
- YAML
- CSV
- 自定义格式
基本用法
JSON 模式定义
python
import openai
import json
client = openai.OpenAI(
api_key="your_api_key",
base_url="https://realmrouter.cn/v1"
)
# 定义 JSON 模式
json_schema = {
"type": "object",
"properties": {
"name": {
"type": "string",
"description": "用户姓名"
},
"age": {
"type": "integer",
"description": "用户年龄",
"minimum": 0,
"maximum": 150
},
"email": {
"type": "string",
"format": "email",
"description": "邮箱地址"
},
"skills": {
"type": "array",
"items": {
"type": "string"
},
"description": "技能列表"
},
"address": {
"type": "object",
"properties": {
"street": {"type": "string"},
"city": {"type": "string"},
"country": {"type": "string"}
},
"required": ["city", "country"]
}
},
"required": ["name", "age", "email"]
}
response = client.chat.completions.create(
model="gpt-4",
messages=[
{
"role": "user",
"content": """
请从以下文本中提取用户信息并转换为JSON格式:
"张三,28岁,邮箱是zhangsan@example.com,擅长Python、JavaScript和机器学习。
住址:北京市朝阳区中国。"
"""
}
],
response_format={"type": "json_object"}
)
result = json.loads(response.choices[0].message.content)
print(json.dumps(result, indent=2, ensure_ascii=False))使用 Pydantic 模式
python
from pydantic import BaseModel
from typing import List, Optional
# Address model; used as the type of the `address` field of UserProfile.
class Address(BaseModel):
    street: Optional[str] = None  # optional; defaults to None
    city: str  # required
    country: str  # required
# User profile model; its JSON schema is interpolated into the system prompt
# by extract_user_info. All five fields are required (none has a default).
class UserProfile(BaseModel):
    name: str
    age: int
    email: str
    skills: List[str]
    address: Address  # nested model
def extract_user_info(text: str) -> UserProfile:
    """Extract a ``UserProfile`` from free-form text.

    Sends ``text`` to the chat-completions endpoint together with the
    ``UserProfile`` JSON schema, parses the reply as JSON and validates it.

    Args:
        text: Free-form text describing a user.

    Returns:
        The validated ``UserProfile`` instance.

    Raises:
        json.JSONDecodeError: If the reply is not valid JSON.
        pydantic.ValidationError: If the parsed JSON does not match
            ``UserProfile``.
    """
    # Serialize the schema as real JSON. Interpolating the dict directly
    # would embed its Python repr (single quotes, True/False) in the prompt.
    schema_text = json.dumps(
        UserProfile.model_json_schema(), indent=2, ensure_ascii=False
    )
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {
                "role": "system",
                "content": f"请提取用户信息,严格按照以下JSON模式输出:\n{schema_text}",
            },
            {"role": "user", "content": text},
        ],
        response_format={"type": "json_object"},
    )
    data = json.loads(response.choices[0].message.content)
    # model_validate reports any shape mismatch as a ValidationError.
    return UserProfile.model_validate(data)
# Usage example: extract a profile and print three of its fields.
text = "李四,32岁,邮箱lisi@example.com,擅长数据分析、SQL和可视化。住址:上海市浦东新区中国。"
profile = extract_user_info(text)
print(
    f"姓名: {profile.name}",
    f"年龄: {profile.age}",
    f"城市: {profile.address.city}",
    sep="\n",
)

高级功能
复杂嵌套结构
python
# Nested schema: a company object holding an array of employee objects,
# each with an optional nested contact object.
complex_schema = {
    "type": "object",
    "properties": {
        "company": {
            "type": "object",
            "properties": {
                "name": {"type": "string"},
                "employees": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": {
                            "id": {"type": "integer"},
                            "name": {"type": "string"},
                            "department": {"type": "string"},
                            "contact": {
                                "type": "object",
                                "properties": {
                                    "email": {"type": "string", "format": "email"},
                                    "phone": {"type": "string"},
                                },
                            },
                        },
                        "required": ["id", "name", "department"],
                    },
                },
            },
        },
    },
}

# The schema is sent in a system message: `response_format` only asks for a
# JSON object, and the user message alone never mentions the schema.
response = client.chat.completions.create(
    model="gpt-4",
    messages=[
        {
            "role": "system",
            "content": "请严格按照以下JSON模式输出:\n"
            + json.dumps(complex_schema, indent=2, ensure_ascii=False),
        },
        {
            "role": "user",
            "content": """
创建一个公司信息结构,包含:
- 公司名称:科技创新有限公司
- 员工信息:
* ID: 001, 姓名: 王五, 部门: 研发部, 邮箱: wangwu@company.com, 电话: 13800138000
* ID: 002, 姓名: 赵六, 部门: 市场部, 邮箱: zhaoliu@company.com, 电话: 13800138001
""",
        },
    ],
    response_format={"type": "json_object"},
)

条件性和可选字段
python
# Schema with a discriminator (`user_type`) and two optional sub-objects;
# only `user_type` is required at the top level.
conditional_schema = {
    "type": "object",
    "properties": {
        "user_type": {
            "type": "string",
            "enum": ["individual", "business"],
        },
        "individual_info": {
            "type": "object",
            "properties": {
                "first_name": {"type": "string"},
                "last_name": {"type": "string"},
                "personal_id": {"type": "string"},
            },
            "required": ["first_name", "last_name"],
        },
        "business_info": {
            "type": "object",
            "properties": {
                "company_name": {"type": "string"},
                "registration_number": {"type": "string"},
                "contact_person": {"type": "string"},
            },
            "required": ["company_name", "registration_number"],
        },
    },
    "required": ["user_type"],
}

# The original system prompt is kept verbatim and the schema is appended to
# it; without that the model never sees `conditional_schema`.
response = client.chat.completions.create(
    model="gpt-4",
    messages=[
        {
            "role": "system",
            "content": """
根据用户类型提取相应信息:
- 如果是个人用户,填写 individual_info
- 如果是企业用户,填写 business_info
"""
            + "请严格按照以下JSON模式输出:\n"
            + json.dumps(conditional_schema, indent=2, ensure_ascii=False),
        },
        {
            "role": "user",
            "content": "企业用户:ABC科技有限公司,注册号123456789,联系人张经理",
        },
    ],
    response_format={"type": "json_object"},
)

数组和枚举
python
# Schema for a product list: an array of objects with an enum-constrained
# category, a non-negative price and a free-form tag array.
array_enum_schema = {
    "type": "object",
    "properties": {
        "products": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "id": {"type": "string"},
                    "name": {"type": "string"},
                    "category": {
                        "type": "string",
                        "enum": ["electronics", "clothing", "food", "books"],
                    },
                    "price": {"type": "number", "minimum": 0},
                    "tags": {
                        "type": "array",
                        "items": {"type": "string"},
                    },
                },
                "required": ["id", "name", "category", "price"],
            },
        },
    },
}

# The schema is sent in a system message: `response_format` only asks for a
# JSON object, and the user message alone never mentions the schema.
response = client.chat.completions.create(
    model="gpt-4",
    messages=[
        {
            "role": "system",
            "content": "请严格按照以下JSON模式输出:\n"
            + json.dumps(array_enum_schema, indent=2, ensure_ascii=False),
        },
        {
            "role": "user",
            "content": """
产品列表:
1. 笔记本电脑,电子产品,价格5999元,标签:高性能、轻薄
2. T恤,服装,价格99元,标签:纯棉、舒适
3. 小说,图书,价格39元,标签:文学、畅销
""",
        },
    ],
    response_format={"type": "json_object"},
)

实际应用场景
1. 数据提取和清洗
python
def extract_structured_data(text: str, schema: dict) -> dict:
    """Extract structured data from unstructured text.

    Args:
        text: The unstructured source text, sent as the user message.
        schema: JSON Schema (as a dict) embedded in the system prompt.

    Returns:
        The model reply parsed with ``json.loads``. It is not validated
        against ``schema`` here.

    Raises:
        json.JSONDecodeError: If the reply is not valid JSON.
    """
    # ensure_ascii=False keeps non-ASCII schema text (e.g. Chinese
    # descriptions) readable instead of escaping it to \uXXXX sequences.
    schema_text = json.dumps(schema, indent=2, ensure_ascii=False)
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {
                "role": "system",
                "content": f"请从文本中提取信息,严格按照以下JSON模式输出:\n{schema_text}",
            },
            {"role": "user", "content": text},
        ],
        response_format={"type": "json_object"},
        temperature=0.1,  # kept low for more consistent output
    )
    return json.loads(response.choices[0].message.content)
# Usage example: schema for an invoice with a list of line items.
invoice_schema = {
    "type": "object",
    "properties": {
        "invoice_number": {"type": "string"},
        "date": {"type": "string", "format": "date"},
        "vendor": {"type": "string"},
        "items": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "description": {"type": "string"},
                    "quantity": {"type": "integer"},
                    "unit_price": {"type": "number"},
                    "total": {"type": "number"},
                },
            },
        },
        "total_amount": {"type": "number"},
    },
}
invoice_text = """
发票号:INV-2024-001
日期:2024-01-15
供应商:ABC供应商
项目:
1. 办公用品 x10 单价25元 总计250元
2. 打印纸 x5 单价30元 总计150元
总金额:400元
"""
extracted_data = extract_structured_data(invoice_text, invoice_schema)
print(json.dumps(extracted_data, indent=2, ensure_ascii=False))2. API 响应生成
python
def generate_api_response(user_query: str, response_schema: dict) -> dict:
    """Generate structured data shaped like an API response.

    Args:
        user_query: The query, sent as the user message.
        response_schema: JSON Schema (as a dict) embedded in the system
            prompt.

    Returns:
        The model reply parsed with ``json.loads``. It is not validated
        against ``response_schema`` here.

    Raises:
        json.JSONDecodeError: If the reply is not valid JSON.
    """
    # ensure_ascii=False keeps non-ASCII schema text readable instead of
    # escaping it to \uXXXX sequences in the prompt.
    schema_text = json.dumps(response_schema, indent=2, ensure_ascii=False)
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {
                "role": "system",
                "content": f"你是一个API服务,请根据用户查询生成符合以下模式的响应:\n{schema_text}",
            },
            {"role": "user", "content": user_query},
        ],
        response_format={"type": "json_object"},
    )
    return json.loads(response.choices[0].message.content)
# Schema for a paginated search response: a success flag, a data object
# with scored results and paging fields, a message and a timestamp.
api_response_schema = {
    "type": "object",
    "properties": {
        "success": {"type": "boolean"},
        "data": {
            "type": "object",
            "properties": {
                "results": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": {
                            "id": {"type": "string"},
                            "title": {"type": "string"},
                            "description": {"type": "string"},
                            "relevance_score": {
                                "type": "number",
                                "minimum": 0,
                                "maximum": 1,
                            },
                        },
                    },
                },
                "total_count": {"type": "integer"},
                "page": {"type": "integer"},
                "page_size": {"type": "integer"},
            },
        },
        "message": {"type": "string"},
        "timestamp": {"type": "string", "format": "date-time"},
    },
}
query = "搜索关于人工智能的文章"
api_response = generate_api_response(query, api_response_schema)3. 配置文件生成
python
def generate_config(requirements: str) -> dict:
    """Generate a configuration dict from a requirements description.

    The schema built here has three object sections (``database``,
    ``server`` and ``features``) and is embedded in the system prompt;
    ``requirements`` is sent as the user message.

    Returns:
        The model reply parsed with ``json.loads``.
    """
    database_section = {
        "type": "object",
        "properties": {
            "host": {"type": "string"},
            "port": {"type": "integer"},
            "name": {"type": "string"},
            "username": {"type": "string"},
            "password": {"type": "string"},
        },
    }
    server_section = {
        "type": "object",
        "properties": {
            "host": {"type": "string"},
            "port": {"type": "integer"},
            "debug": {"type": "boolean"},
            "workers": {"type": "integer"},
        },
    }
    features_section = {
        "type": "object",
        "properties": {
            "enable_cache": {"type": "boolean"},
            "enable_logging": {"type": "boolean"},
            "enable_monitoring": {"type": "boolean"},
        },
    }
    config_schema = {
        "type": "object",
        "properties": {
            "database": database_section,
            "server": server_section,
            "features": features_section,
        },
    }

    system_prompt = (
        "根据需求生成配置文件,严格按照以下JSON模式:\n"
        + json.dumps(config_schema, indent=2)
    )
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": requirements},
    ]
    completion = client.chat.completions.create(
        model="gpt-4",
        messages=conversation,
        response_format={"type": "json_object"},
    )
    reply_text = completion.choices[0].message.content
    return json.loads(reply_text)
# Usage example: requirements text passed to generate_config.
requirements = """
需要一个Web应用配置:
- 数据库:MySQL,本地主机,端口3306,数据库名webapp
- 服务器:监听所有接口,端口8080,开启调试模式,4个工作进程
- 功能:启用缓存和日志,禁用监控
"""
config = generate_config(requirements=requirements)

最佳实践
1. 模式设计
python
# Good schema design: every field is typed, constrained and described.
good_schema = {
    "type": "object",
    "properties": {
        "user_id": {
            "type": "string",
            "pattern": "^[a-zA-Z0-9]{8,20}$",
            "description": "用户ID,8-20位字母数字",
        },
        "email": {
            "type": "string",
            "format": "email",
            "description": "有效的邮箱地址",
        },
        "age": {
            "type": "integer",
            "minimum": 0,
            "maximum": 150,
            "description": "年龄范围0-150",
        },
    },
    "required": ["user_id", "email"],
    # Reject any property not listed above.
    "additionalProperties": False,
}
# Schema design to avoid
bad_schema = {
    "type": "object",
    "properties": {
        "data": {"type": "object"}  # too broad: no properties or constraints
    }
}

2. 错误处理和验证
python
from pydantic import ValidationError
def safe_structured_output(text: str, schema_class):
    """Generate structured output without raising.

    Args:
        text: Input text, sent as the user message.
        schema_class: A pydantic model class; its JSON schema is embedded in
            the system prompt and the reply is validated against it.

    Returns:
        A validated ``schema_class`` instance, or ``None`` if the request,
        the JSON parsing or the validation fails (the error is printed).
    """
    try:
        # Serialize the schema as real JSON. Interpolating the dict directly
        # would embed its Python repr (single quotes, True/False).
        schema_text = json.dumps(
            schema_class.model_json_schema(), indent=2, ensure_ascii=False
        )
        response = client.chat.completions.create(
            model="gpt-4",
            messages=[
                {
                    "role": "system",
                    "content": f"严格按照以下JSON模式输出:\n{schema_text}",
                },
                {"role": "user", "content": text},
            ],
            response_format={"type": "json_object"},
            temperature=0.1,
        )
        data = json.loads(response.choices[0].message.content)
        # model_validate reports any shape mismatch as a ValidationError.
        return schema_class.model_validate(data)
    except json.JSONDecodeError as e:
        print(f"JSON解析错误: {e}")
        return None
    except ValidationError as e:
        print(f"数据验证错误: {e}")
        return None
    except Exception as e:
        # Deliberate catch-all: this helper is the "never raises" boundary.
        print(f"其他错误: {e}")
        return None
# 使用示例
result = safe_structured_output(user_input, UserProfile)
if result:
print("生成成功:", result)
else:
print("生成失败")3. 性能优化
python
# 缓存常用模式
from functools import lru_cache
@lru_cache(maxsize=50)
def get_schema_string(schema_class):
    """Return the JSON schema of ``schema_class`` as an indented string.

    Results are cached per class (up to 50 entries), so
    ``model_json_schema()`` and the serialization run once per class.

    Args:
        schema_class: A hashable class exposing ``model_json_schema()``.

    Returns:
        The schema serialized with two-space indentation.
    """
    # ensure_ascii=False keeps non-ASCII titles and descriptions readable
    # instead of escaping them to \uXXXX sequences.
    return json.dumps(
        schema_class.model_json_schema(), indent=2, ensure_ascii=False
    )
# Batch processing
def batch_structured_extraction(texts: list, schema: dict) -> list:
    """Extract structured data from several texts, one request per text.

    Args:
        texts: Input texts, each sent as a separate user message.
        schema: JSON Schema (as a dict) embedded in the system prompt.

    Returns:
        A list aligned with ``texts``: the parsed JSON reply for each text,
        or ``None`` where the request or the parsing failed (the error is
        printed).

    Raises:
        TypeError: If ``schema`` is not JSON-serializable. This is raised
            once, before any request is made.
    """
    # The system prompt depends only on `schema`, so build it once instead
    # of re-serializing the schema for every text.
    system_message = {
        "role": "system",
        "content": "严格按照以下JSON模式输出:\n"
        + json.dumps(schema, indent=2, ensure_ascii=False),
    }
    results = []
    for text in texts:
        try:
            response = client.chat.completions.create(
                model="gpt-4",
                messages=[system_message, {"role": "user", "content": text}],
                response_format={"type": "json_object"},
                temperature=0.1,
            )
            results.append(json.loads(response.choices[0].message.content))
        except Exception as e:
            # Best effort: one failing text must not abort the whole batch.
            print(f"处理失败: {e}")
            results.append(None)
    return results

限制和注意事项
- 模式复杂度:过于复杂的模式可能影响生成质量
- 数据一致性:建议使用低温度参数确保一致性
- 验证机制:始终验证生成的数据是否符合模式(示例中的 `response_format={"type": "json_object"}` 本身不携带任何模式信息,因此返回的 JSON 不一定符合自定义模式)
- 性能考虑:结构化输出可能比普通输出稍慢
- 模型支持:确保使用的模型支持结构化输出功能
调试和测试
python
def test_structured_output():
    """Smoke-test ``safe_structured_output`` against sample inputs.

    For each case, prints a pass line when a result is returned and every
    expected field is present on it, and a failure line when the result is
    falsy. A missing field raises ``AssertionError``.
    """
    test_cases = [
        {
            "input": "用户:张三,25岁,邮箱zhangsan@test.com",
            "schema": UserProfile.model_json_schema(),
            "expected_fields": ["name", "age", "email"],
        },
    ]
    for case in test_cases:
        sample = case["input"]
        parsed = safe_structured_output(sample, UserProfile)
        if not parsed:
            print(f"测试失败: {sample}")
            continue
        for field in case["expected_fields"]:
            assert hasattr(parsed, field), f"缺少字段: {field}"
        print(f"测试通过: {sample}")
# Run the test
test_structured_output()