Skip to content

结构化输出(Structured Outputs)

结构化输出是大语言模型的一项重要功能:模型按照预先约定的格式和模式(schema)生成结构化数据(如 JSON、XML),便于程序解析和系统集成。

概述

结构化输出的优势:

  • 数据格式一致性
  • 易于程序解析
  • 减少后处理工作
  • 提高数据质量
  • 支持复杂嵌套结构

支持的格式

JSON 格式

  • 对象和数组
  • 基本数据类型
  • 嵌套结构
  • 枚举值

其他格式

  • XML
  • YAML
  • CSV
  • 自定义格式

基本用法

JSON 模式定义

python
import openai
import json

# Client for an OpenAI-compatible endpoint; replace the placeholder key
# with a real one before running.
client = openai.OpenAI(
    api_key="your_api_key",
    base_url="https://realmrouter.cn/v1"
)

# JSON Schema describing the user record the model should return.
json_schema = {
    "type": "object",
    "properties": {
        "name": {
            "type": "string",
            "description": "用户姓名"
        },
        "age": {
            "type": "integer",
            "description": "用户年龄",
            "minimum": 0,
            "maximum": 150
        },
        "email": {
            "type": "string",
            "format": "email",
            "description": "邮箱地址"
        },
        "skills": {
            "type": "array",
            "items": {
                "type": "string"
            },
            "description": "技能列表"
        },
        "address": {
            "type": "object",
            "properties": {
                "street": {"type": "string"},
                "city": {"type": "string"},
                "country": {"type": "string"}
            },
            "required": ["city", "country"]
        }
    },
    "required": ["name", "age", "email"]
}

response = client.chat.completions.create(
    model="gpt-4",
    messages=[
        {
            # The schema must be part of the prompt: the "json_object"
            # response format only asks for syntactically valid JSON and
            # knows nothing about json_schema unless the model is told.
            "role": "system",
            "content": "请严格按照以下JSON模式输出:\n"
                       + json.dumps(json_schema, indent=2, ensure_ascii=False)
        },
        {
            "role": "user",
            "content": """
            请从以下文本中提取用户信息并转换为JSON格式:
            
            "张三,28岁,邮箱是zhangsan@example.com,擅长Python、JavaScript和机器学习。
            住址:北京市朝阳区中国。"
            """
        }
    ],
    response_format={"type": "json_object"}
)

# Parse the reply and pretty-print it, keeping Chinese text readable.
result = json.loads(response.choices[0].message.content)
print(json.dumps(result, indent=2, ensure_ascii=False))

使用 Pydantic 模式

python
from pydantic import BaseModel
from typing import List, Optional

# Postal address model used as the nested ``address`` field of UserProfile.
# Documented with comments rather than a docstring so the generated JSON
# schema stays unchanged.
class Address(BaseModel):
    street: Optional[str] = None  # optional; defaults to None when absent
    city: str  # required
    country: str  # required

# User record the model output is validated against.
# NOTE(review): every field here is required (none has a default), whereas the
# hand-written json_schema earlier in this document only requires name, age
# and email — confirm that requiring skills and address is intended.
class UserProfile(BaseModel):
    name: str
    age: int
    email: str
    skills: List[str]
    address: Address  # nested model; itself requires city and country

def extract_user_info(text: str) -> UserProfile:
    """Extract a ``UserProfile`` from free-form text via the chat model.

    The model is asked for a JSON object matching the ``UserProfile`` schema;
    the reply is parsed with ``json.loads`` and validated by constructing a
    ``UserProfile`` from it.

    Args:
        text: Unstructured text describing a user.

    Returns:
        The validated profile.

    Raises:
        json.JSONDecodeError: If the reply is not valid JSON.
        pydantic.ValidationError: If the JSON does not satisfy ``UserProfile``.
    """
    # Serialize the schema as real JSON. Interpolating the dict directly would
    # embed its Python repr (single quotes, True/None), which is not JSON.
    schema_json = json.dumps(
        UserProfile.model_json_schema(), indent=2, ensure_ascii=False
    )
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {
                "role": "system",
                "content": f"请提取用户信息,严格按照以下JSON模式输出:\n{schema_json}"
            },
            {
                "role": "user",
                "content": text
            }
        ],
        response_format={"type": "json_object"}
    )

    data = json.loads(response.choices[0].message.content)
    return UserProfile(**data)

# Usage example
text = "李四,32岁,邮箱lisi@example.com,擅长数据分析、SQL和可视化。住址:上海市浦东新区中国。"
profile = extract_user_info(text)
# The result is a UserProfile, so fields (including the nested address)
# are read as attributes.
print(f"姓名: {profile.name}")
print(f"年龄: {profile.age}")
print(f"城市: {profile.address.city}")

高级功能

复杂嵌套结构

python
# Nested schema: a company object holding a list of employee objects, each
# with an optional contact sub-object.
complex_schema = {
    "type": "object",
    "properties": {
        "company": {
            "type": "object",
            "properties": {
                "name": {"type": "string"},
                "employees": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": {
                            "id": {"type": "integer"},
                            "name": {"type": "string"},
                            "department": {"type": "string"},
                            "contact": {
                                "type": "object",
                                "properties": {
                                    "email": {"type": "string", "format": "email"},
                                    "phone": {"type": "string"}
                                }
                            }
                        },
                        "required": ["id", "name", "department"]
                    }
                }
            }
        }
    }
}

response = client.chat.completions.create(
    model="gpt-4",
    messages=[
        {
            # Send the schema to the model; without this message the schema
            # above is never used. It also puts the word "JSON" into the
            # conversation, which OpenAI's JSON mode requires when
            # response_format is "json_object".
            "role": "system",
            "content": "请严格按照以下JSON模式输出:\n"
                       + json.dumps(complex_schema, indent=2, ensure_ascii=False)
        },
        {
            "role": "user",
            "content": """
            创建一个公司信息结构,包含:
            - 公司名称:科技创新有限公司
            - 员工信息:
              * ID: 001, 姓名: 王五, 部门: 研发部, 邮箱: wangwu@company.com, 电话: 13800138000
              * ID: 002, 姓名: 赵六, 部门: 市场部, 邮箱: zhaoliu@company.com, 电话: 13800138001
            """
        }
    ],
    response_format={"type": "json_object"}
)

条件性和可选字段

python
# Schema with a discriminator (user_type) and two optional detail objects;
# only user_type itself is required at the top level.
conditional_schema = {
    "type": "object",
    "properties": {
        "user_type": {
            "type": "string",
            "enum": ["individual", "business"]
        },
        "individual_info": {
            "type": "object",
            "properties": {
                "first_name": {"type": "string"},
                "last_name": {"type": "string"},
                "personal_id": {"type": "string"}
            },
            "required": ["first_name", "last_name"]
        },
        "business_info": {
            "type": "object",
            "properties": {
                "company_name": {"type": "string"},
                "registration_number": {"type": "string"},
                "contact_person": {"type": "string"}
            },
            "required": ["company_name", "registration_number"]
        }
    },
    "required": ["user_type"]
}

response = client.chat.completions.create(
    model="gpt-4",
    messages=[
        {
            "role": "system",
            # The schema is appended to the instructions so the model actually
            # sees it. This also puts the word "JSON" into the conversation,
            # which OpenAI's JSON mode requires for "json_object".
            "content": """
            根据用户类型提取相应信息:
            - 如果是个人用户,填写 individual_info
            - 如果是企业用户,填写 business_info
            请严格按照以下JSON模式输出:
            """ + json.dumps(conditional_schema, indent=2, ensure_ascii=False)
        },
        {
            "role": "user",
            "content": "企业用户:ABC科技有限公司,注册号123456789,联系人张经理"
        }
    ],
    response_format={"type": "json_object"}
)

数组和枚举

python
# Schema for a list of products; category is restricted to a fixed set of
# values and price must be non-negative.
array_enum_schema = {
    "type": "object",
    "properties": {
        "products": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "id": {"type": "string"},
                    "name": {"type": "string"},
                    "category": {
                        "type": "string",
                        "enum": ["electronics", "clothing", "food", "books"]
                    },
                    "price": {"type": "number", "minimum": 0},
                    "tags": {
                        "type": "array",
                        "items": {"type": "string"}
                    }
                },
                "required": ["id", "name", "category", "price"]
            }
        }
    }
}

response = client.chat.completions.create(
    model="gpt-4",
    messages=[
        {
            # Send the schema (including the allowed category values) to the
            # model; without this message the schema above is never used.
            # It also puts the word "JSON" into the conversation, which
            # OpenAI's JSON mode requires for "json_object".
            "role": "system",
            "content": "请严格按照以下JSON模式输出:\n"
                       + json.dumps(array_enum_schema, indent=2, ensure_ascii=False)
        },
        {
            "role": "user",
            "content": """
            产品列表:
            1. 笔记本电脑,电子产品,价格5999元,标签:高性能、轻薄
            2. T恤,服装,价格99元,标签:纯棉、舒适
            3. 小说,图书,价格39元,标签:文学、畅销
            """
        }
    ],
    response_format={"type": "json_object"}
)

实际应用场景

1. 数据提取和清洗

python
def extract_structured_data(text: str, schema: dict) -> dict:
    """Extract schema-shaped data from unstructured text.

    Args:
        text: The free-form text to extract information from.
        schema: JSON Schema (as a dict) that is serialized into the system prompt.

    Returns:
        The model's reply parsed with ``json.loads``.
    """
    system_prompt = f"请从文本中提取信息,严格按照以下JSON模式输出:\n{json.dumps(schema, indent=2)}"
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": text},
    ]

    # A low temperature keeps repeated extractions consistent.
    completion = client.chat.completions.create(
        model="gpt-4",
        messages=conversation,
        response_format={"type": "json_object"},
        temperature=0.1,
    )

    raw_reply = completion.choices[0].message.content
    return json.loads(raw_reply)

# Usage example: extract an invoice (header fields plus line items) from text.
invoice_schema = {
    "type": "object",
    "properties": {
        "invoice_number": {"type": "string"},
        "date": {"type": "string", "format": "date"},
        "vendor": {"type": "string"},
        "items": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "description": {"type": "string"},
                    "quantity": {"type": "integer"},
                    "unit_price": {"type": "number"},
                    "total": {"type": "number"}
                }
            }
        },
        "total_amount": {"type": "number"}
    }
}

# Raw invoice text passed to the model as the user message.
invoice_text = """
发票号:INV-2024-001
日期:2024-01-15
供应商:ABC供应商
项目:
1. 办公用品 x10 单价25元 总计250元
2. 打印纸 x5 单价30元 总计150元
总金额:400元
"""

extracted_data = extract_structured_data(invoice_text, invoice_schema)
# ensure_ascii=False keeps the Chinese text readable in the printed JSON.
print(json.dumps(extracted_data, indent=2, ensure_ascii=False))

2. API 响应生成

python
def generate_api_response(user_query: str, response_schema: dict) -> dict:
    """Generate an API-style structured response for a user query.

    Args:
        user_query: The query sent to the model as the user message.
        response_schema: JSON Schema (as a dict) serialized into the system prompt.

    Returns:
        The model's reply parsed with ``json.loads``.

    Raises:
        json.JSONDecodeError: If the reply is not valid JSON.
    """
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {
                "role": "system",
                # The prompt names "JSON" explicitly: OpenAI's JSON mode requires
                # the word to appear in the conversation when response_format is
                # "json_object", and the user query cannot be relied on for it.
                "content": f"你是一个API服务,请根据用户查询生成符合以下JSON模式的响应:\n{json.dumps(response_schema, indent=2)}"
            },
            {
                "role": "user",
                "content": user_query
            }
        ],
        response_format={"type": "json_object"}
    )

    return json.loads(response.choices[0].message.content)

# Response envelope for a paginated search API: a success flag, a data object
# with a results list and paging counters, a message and a timestamp.
api_response_schema = {
    "type": "object",
    "properties": {
        "success": {"type": "boolean"},
        "data": {
            "type": "object",
            "properties": {
                "results": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": {
                            "id": {"type": "string"},
                            "title": {"type": "string"},
                            "description": {"type": "string"},
                            "relevance_score": {"type": "number", "minimum": 0, "maximum": 1}
                        }
                    }
                },
                "total_count": {"type": "integer"},
                "page": {"type": "integer"},
                "page_size": {"type": "integer"}
            }
        },
        "message": {"type": "string"},
        "timestamp": {"type": "string", "format": "date-time"}
    }
}

# Usage example: ask for a search response in the envelope format above.
query = "搜索关于人工智能的文章"
api_response = generate_api_response(query, api_response_schema)

3. 配置文件生成

python
def generate_config(requirements: str) -> dict:
    """Turn a natural-language requirements description into a config dict.

    The configuration schema (database, server and features sections) is
    built locally and serialized into the system prompt.

    Args:
        requirements: Free-form description of the desired configuration.

    Returns:
        The model's reply parsed with ``json.loads``.
    """
    def section(**field_types):
        # One object-typed section; keyword order fixes the property order.
        return {
            "type": "object",
            "properties": {
                field: {"type": kind} for field, kind in field_types.items()
            },
        }

    config_schema = {
        "type": "object",
        "properties": {
            "database": section(
                host="string",
                port="integer",
                name="string",
                username="string",
                password="string",
            ),
            "server": section(
                host="string",
                port="integer",
                debug="boolean",
                workers="integer",
            ),
            "features": section(
                enable_cache="boolean",
                enable_logging="boolean",
                enable_monitoring="boolean",
            ),
        },
    }

    system_prompt = f"根据需求生成配置文件,严格按照以下JSON模式:\n{json.dumps(config_schema, indent=2)}"
    completion = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": requirements},
        ],
        response_format={"type": "json_object"},
    )

    raw_reply = completion.choices[0].message.content
    return json.loads(raw_reply)

# Usage example: natural-language requirements covering the database, server
# and feature-flag sections of the configuration.
requirements = """
需要一个Web应用配置:
- 数据库:MySQL,本地主机,端口3306,数据库名webapp
- 服务器:监听所有接口,端口8080,开启调试模式,4个工作进程
- 功能:启用缓存和日志,禁用监控
"""

config = generate_config(requirements)

最佳实践

1. 模式设计

python
# Good schema design: every field is typed, constrained and described.
good_schema = {
    "type": "object",
    "properties": {
        "user_id": {
            "type": "string",
            "pattern": "^[a-zA-Z0-9]{8,20}$",
            "description": "用户ID,8-20位字母数字"
        },
        "email": {
            "type": "string",
            "format": "email",
            "description": "有效的邮箱地址"
        },
        "age": {
            "type": "integer",
            "minimum": 0,
            "maximum": 150,
            "description": "年龄范围0-150"
        }
    },
    "required": ["user_id", "email"],
    "additionalProperties": False  # forbid properties not listed above
}

# Schema design to avoid
bad_schema = {
    "type": "object",
    "properties": {
        "data": {"type": "object"}  # too broad: any object is accepted
    }
}

2. 错误处理和验证

python
from pydantic import ValidationError

def safe_structured_output(text: str, schema_class):
    """Generate structured output without raising; return ``None`` on failure.

    The model is asked for a JSON object matching ``schema_class``'s JSON
    schema; the reply is parsed and validated by constructing ``schema_class``.

    Args:
        text: The user message sent to the model.
        schema_class: A Pydantic model class (it must provide
            ``model_json_schema()`` and accept the parsed fields as keyword
            arguments).

    Returns:
        An instance of ``schema_class``, or ``None`` if the request, the JSON
        parsing or the validation failed. The failure reason is printed.
    """
    try:
        # Serialize the schema as real JSON. Interpolating the dict directly
        # would embed its Python repr (single quotes, True/None), which is
        # not JSON.
        schema_json = json.dumps(
            schema_class.model_json_schema(), indent=2, ensure_ascii=False
        )
        response = client.chat.completions.create(
            model="gpt-4",
            messages=[
                {
                    "role": "system",
                    "content": f"严格按照以下JSON模式输出:\n{schema_json}"
                },
                {
                    "role": "user",
                    "content": text
                }
            ],
            response_format={"type": "json_object"},
            temperature=0.1
        )

        data = json.loads(response.choices[0].message.content)
        return schema_class(**data)

    except json.JSONDecodeError as e:
        print(f"JSON解析错误: {e}")
        return None
    except ValidationError as e:
        print(f"数据验证错误: {e}")
        return None
    except Exception as e:
        # Deliberate catch-all: this helper promises never to raise.
        print(f"其他错误: {e}")
        return None

# Usage example
# user_input was never defined in the original snippet (NameError). The sample
# mentions skills and an address because UserProfile requires both fields.
user_input = "王五,30岁,邮箱wangwu@example.com,擅长Java和Go。住址:广州市天河区中国。"
result = safe_structured_output(user_input, UserProfile)
if result:
    print("生成成功:", result)
else:
    print("生成失败")

3. 性能优化

python
# 缓存常用模式
from functools import lru_cache

@lru_cache(maxsize=50)
def get_schema_string(schema_class):
    """Return the class's JSON schema as an indented string, cached per class."""
    schema = schema_class.model_json_schema()
    return json.dumps(schema, indent=2)

# 批量处理
def batch_structured_extraction(texts: list, schema: dict) -> list:
    """Extract structured data from each text, one request per text.

    Args:
        texts: The texts to process.
        schema: JSON Schema (as a dict) serialized into the system prompt.

    Returns:
        A list aligned with ``texts``: the parsed JSON reply for each text, or
        ``None`` where the request or the parsing failed (the error is printed).

    Raises:
        TypeError: If ``schema`` is not JSON-serializable. This is reported
            once up front instead of failing identically for every text.
    """
    # The system message depends only on the schema, so build it once
    # instead of re-serializing the schema for every text.
    system_message = {
        "role": "system",
        "content": f"严格按照以下JSON模式输出:\n{json.dumps(schema, indent=2)}"
    }

    results = []

    for text in texts:
        try:
            response = client.chat.completions.create(
                model="gpt-4",
                messages=[
                    system_message,
                    {
                        "role": "user",
                        "content": text
                    }
                ],
                response_format={"type": "json_object"},
                temperature=0.1
            )

            result = json.loads(response.choices[0].message.content)
            results.append(result)

        except Exception as e:
            # Best-effort batch: one failing text must not abort the rest, and
            # the None placeholder keeps results aligned with texts.
            print(f"处理失败: {e}")
            results.append(None)

    return results

限制和注意事项

  1. 模式复杂度:过于复杂的模式可能影响生成质量
  2. 数据一致性:建议使用低温度参数确保一致性
  3. 验证机制:`json_object` 模式只保证输出是合法 JSON,并不保证符合模式,因此务必在代码中验证生成的数据(例如使用 Pydantic)
  4. 性能考虑:结构化输出可能比普通输出稍慢
  5. 模型支持:确保使用的模型支持结构化输出功能

调试和测试

python
def test_structured_output():
    """Smoke-test structured output generation against a table of cases.

    Each case names its input text, the model class to validate against and
    the fields the result must expose. A case passes when
    ``safe_structured_output`` returns an object that has every expected field;
    a ``None`` result is reported as a failure via ``print``.

    Raises:
        AssertionError: If a returned object lacks an expected field.
    """
    test_cases = [
        {
            "input": "用户:张三,25岁,邮箱zhangsan@test.com",
            # The class is taken from the case (the original stored an unused
            # schema dict here and hard-coded UserProfile in the call below).
            # NOTE(review): UserProfile also requires skills and address, which
            # this input does not mention — validation may fail; confirm.
            "schema_class": UserProfile,
            "expected_fields": ["name", "age", "email"]
        }
    ]

    for case in test_cases:
        result = safe_structured_output(case["input"], case["schema_class"])

        if result:
            for field in case["expected_fields"]:
                assert hasattr(result, field), f"缺少字段: {field}"
            print(f"测试通过: {case['input']}")
        else:
            print(f"测试失败: {case['input']}")

# Run the test (executes immediately when this snippet is run)
test_structured_output()

基于 MIT 许可发布 厦门界云聚算网络科技有限公司