DeepSeek-OCR 实测：多模态OCR模型性能与应用解析

去年12月，DeepSeek大模型横空出世，官方宣称其性能超越国际一流大模型，一度被誉为“国产大模型之光”。当时，朋友圈、媒体和技术社区都对它寄予厚望，认为它点亮了AI的未来。

然而，DeepSeek的实际表现并未完全达到宣传时的高度，在与其他大模型的对比中，未能充分展现出其宣称的卓越实力。

时隔10个月，DeepSeek团队低调发布了一款名为DeepSeek-OCR的小工具。这款模型没有进行大规模营销，却在实际使用中展现出了令人惊喜的识别效果。

DeepSeek-OCR 是什么？

DeepSeek-OCR是一款先进的多模态OCR模型，能够精准识别图片、PDF、扫描件中的文字、表格、数学公式，并完整保留原始文档的排版结构。

相较于传统OCR工具，DeepSeek-OCR具有以下显著优势：

复杂排版支持：能准确还原图文混排、复杂表格和数学公式。
多语言识别：不仅支持中文，还能有效处理英文、日文等多种语言内容。
结构化输出：可直接生成Markdown、JSON等格式，极大地便利了后续的数据处理和应用开发。
开源可用性：模型可在Hugging Face平台下载，部署过程简便快捷。

DeepSeek-OCR 实测体验

本次测试使用DeepSeek-OCR对以下两类文档进行了识别：

普通图片文字：识别率接近100%，几乎没有错字。如下图所示，左侧为手机拍摄的图片，右侧为DeepSeek-OCR程序运行后的识别结果对比：

DeepSeek-OCR识别手机拍照图片效果对比

PDF文档（数学试卷，图文混排，含公式和表格）：识别效果令人印象深刻。段落、标题、表格均得到完整保留，结构清晰。数学公式的识别准确率接近100%，LaTeX表达式输出非常稳定。以下为一份初中一年级数学试卷的识别效果展示：

DeepSeek-OCR识别初中数学试卷（含公式）第一页

DeepSeek-OCR识别初中数学试卷（含公式）第二页

相比之下，传统OCR工具在处理数学公式和复杂排版时常出现识别错误，而DeepSeek-OCR则实现了“无痛还原”。其输出文本几乎与原始试卷一模一样，公式、表格、段落均被完整保留。

DeepSeek-OCR 测试代码示例

"""DeepSeek-OCR 测试脚本"""
from transformers import AutoModel, AutoTokenizer
import torch
import os
import time
import psutil
import GPUtil
from pathlib import Path
from PIL import Image
import fitz  # PyMuPDF

# ============ Configuration ============
# 1. Model path
model_name = 'deepseek-ai/DeepSeek-OCR'  # downloaded from HuggingFace
# 2. Path of the image/PDF to recognize
IMAGE_PATH = 'images/2024数学练习.pdf'
# 3. Output directory
OUTPUT_DIR = './output'
# 4. Task type: 'markdown' or 'ocr'
TASK = 'markdown'
# 5. GPU device
os.environ["CUDA_VISIBLE_DEVICES"] = '0'
# =================================

def _rss_mb(process):
    """Return the resident set size of *process* in MB."""
    return process.memory_info().rss / 1024 / 1024


def _first_gpu():
    """Return the first GPU reported by GPUtil, or None when there is none."""
    gpus = GPUtil.getGPUs()
    return gpus[0] if gpus else None


def main():
    """Run DeepSeek-OCR on ``IMAGE_PATH`` and print timing/resource statistics.

    The model and tokenizer named by ``model_name`` are loaded from the local
    cache, moved to CUDA in bfloat16, and then applied to the input file:

    * A ``.pdf`` input is rendered page by page to PNG files under
      ``OUTPUT_DIR``; each page is recognized separately, its ``result.mmd``
      is read back, its extracted images are copied to ``OUTPUT_DIR/images``
      with a ``page<N>_`` prefix, and all pages are merged into
      ``OUTPUT_DIR/full_result.md``.
    * Any other input is passed to ``model.infer`` directly, with
      ``OUTPUT_DIR`` as the output path.

    A preview of at most 300 characters of the result is printed at the end.
    """
    # Local import: only the PDF branch needs it, and the file's top-level
    # import block does not provide it.
    import shutil

    print("=" * 70)
    print("加载模型…")
    # Baseline memory, so the cost of loading the model can be reported.
    process = psutil.Process()
    initial_memory = _rss_mb(process)

    # Load tokenizer and model.
    load_start = time.time()
    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        trust_remote_code=True,
        local_files_only=True,  # use the local cache only; no online update check
    )
    model = AutoModel.from_pretrained(
        model_name,
        trust_remote_code=True,
        local_files_only=True,  # use the local cache only; no online update check
    )
    model = model.eval().cuda().to(torch.bfloat16)
    load_time = time.time() - load_start

    # State after loading.
    after_load_memory = _rss_mb(process)
    gpu = _first_gpu()
    print(f"✓ 模型加载完成 (耗时: {load_time:.2f}秒)")
    print(f"内存占用: {after_load_memory - initial_memory:.2f} MB")
    if gpu:
        print(f"显存占用: {gpu.memoryUsed:.2f} MB / {gpu.memoryTotal:.2f} MB")
    print("=" * 70)

    # Prompt selection. The "\n" after <image> is part of the prompt text.
    if TASK == 'markdown':
        prompt = "<image>\n<|grounding|>Convert the document to markdown. "
    else:
        prompt = "<image>\n<|grounding|>Free OCR. "

    file_path = Path(IMAGE_PATH)
    is_pdf = file_path.suffix.lower() == '.pdf'

    if is_pdf:
        print(f"\n检测到PDF文件: {IMAGE_PATH}")
        print("正在转换PDF为图片…")
        all_results = []
        total_infer_time = 0

        pdf_doc = fitz.open(IMAGE_PATH)
        try:
            total_pages = len(pdf_doc)
            print(f"PDF共 {total_pages} 页")
            os.makedirs(OUTPUT_DIR, exist_ok=True)

            for page_num in range(total_pages):
                page_no = page_num + 1
                print(f"\n{'='*70}")
                print(f"处理第 {page_no}/{total_pages} 页")
                print("-" * 70)

                # Render the page at 2x zoom and save it as a PNG for the model.
                page = pdf_doc[page_num]
                pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
                temp_image_path = f'{OUTPUT_DIR}/temp_page_{page_no}.png'
                pix.save(temp_image_path)

                # Time only the inference call itself.
                page_output_dir = f'{OUTPUT_DIR}/page_{page_no}'
                infer_start = time.time()
                model.infer(
                    tokenizer,
                    prompt=prompt,
                    image_file=temp_image_path,
                    output_path=page_output_dir,
                    base_size=1024,
                    image_size=640,
                    crop_mode=True,
                    save_results=True,
                )
                infer_time = time.time() - infer_start
                total_infer_time += infer_time

                # The page text is read back from the file written by infer().
                result_file = f'{page_output_dir}/result.mmd'
                if not os.path.exists(result_file):
                    print(f"✗ 第 {page_no} 页识别失败")
                    continue

                with open(result_file, 'r', encoding='utf-8') as f:
                    page_result = f.read()

                # Collect this page's images into OUTPUT_DIR/images, prefixing
                # the page number so files from different pages cannot collide.
                page_images_dir = f'{page_output_dir}/images'
                if os.path.exists(page_images_dir):
                    output_images_dir = f'{OUTPUT_DIR}/images'
                    os.makedirs(output_images_dir, exist_ok=True)
                    for img_file in os.listdir(page_images_dir):
                        src = os.path.join(page_images_dir, img_file)
                        dst = os.path.join(output_images_dir, f'page{page_no}_{img_file}')
                        shutil.copy2(src, dst)
                    # Point the markdown image links at the renamed copies.
                    page_result = page_result.replace('](images/', f'](images/page{page_no}_')

                all_results.append(f"\n\n# 第 {page_no} 页\n\n{page_result}")
                print(f"✓ 第 {page_no} 页识别完成 (耗时: {infer_time:.2f}秒)")
        finally:
            # Release the document even if rendering or inference raised.
            pdf_doc.close()

        # Merge all pages and save the combined markdown.
        result = "\n".join(all_results)
        with open(f'{OUTPUT_DIR}/full_result.md', 'w', encoding='utf-8') as f:
            f.write(result)

        print(f"\n{'='*70}")
        print("PDF处理完成")
        print("=" * 70)
        print(f"总页数:        {total_pages}")
        print(f"总耗时:        {total_infer_time:.2f} 秒")
        if total_pages:
            # Guarded: an empty PDF would otherwise divide by zero.
            print(f"平均每页:      {total_infer_time/total_pages:.2f} 秒")

        # Final resource usage; re-query the GPU so the numbers are current.
        final_memory = _rss_mb(process)
        if gpu:
            gpu = _first_gpu()
        print(f"内存使用:      {final_memory:.2f} MB")
        if gpu:
            print(f"显存占用:      {gpu.memoryUsed:.2f} MB / {gpu.memoryTotal:.2f} MB")
        print("=" * 70)
    else:
        # Single image: hand the file to the model as-is.
        print(f"\n处理文件: {IMAGE_PATH}")
        print("-" * 70)

        infer_start = time.time()
        # NOTE(review): whether infer() returns the recognized text when
        # save_results=True is not visible from this file; if it returns
        # None, the preview below prints "未获取到结果" -- confirm.
        result = model.infer(
            tokenizer,
            prompt=prompt,
            image_file=IMAGE_PATH,
            output_path=OUTPUT_DIR,
            base_size=1024,
            image_size=640,
            crop_mode=True,
            save_results=True,
        )
        infer_time = time.time() - infer_start

        # State after inference.
        cpu_percent_end = psutil.cpu_percent(interval=0.1)
        final_memory = _rss_mb(process)
        gpu_util_end = 0
        if gpu:
            gpu = _first_gpu()
        if gpu:
            gpu_util_end = gpu.load * 100

        # Performance statistics.
        print("-" * 70)
        print("\n性能统计:")
        print("=" * 70)
        print(f"推理耗时:      {infer_time:.2f} 秒")
        print(f"CPU 使用率:    {cpu_percent_end:.1f}%")
        print(f"内存使用:      {final_memory:.2f} MB (推理增加: {final_memory - after_load_memory:.2f} MB)")
        if gpu:
            print(f"GPU 使用率:    {gpu_util_end:.1f}%")
            print(f"显存占用:      {gpu.memoryUsed:.2f} MB / {gpu.memoryTotal:.2f} MB ({gpu.memoryUsed/gpu.memoryTotal*100:.1f}%)")
        print("=" * 70)

    # Result preview (first 300 characters).
    print(f"\n识别结果预览:")
    print("-" * 70)
    if result:
        print(result[:300])
        if len(result) > 300:
            print(f"\n… (共 {len(result)} 字符)")
    else:
        print("未获取到结果")
    print("-" * 70)
    if is_pdf:
        print(f"\n✓ 完整结果已保存到: {OUTPUT_DIR}/full_result.md")
    else:
        print(f"\n✓ 完整结果已保存到: {OUTPUT_DIR}")

# Run the OCR test only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

DeepSeek-OCR 实测：多模态OCR模型性能与应用解析

DeepSeek-OCR 是什么？

DeepSeek-OCR 实测体验

DeepSeek-OCR 测试代码示例

发表回复取消回复

最新内容

《亚洲水发展展望2025》深度解读：亚太水安全喜忧参半，未来挑战何在？

谷歌支付6800万美元和解语音助手监听诉讼，你的隐私可能被“误触发”录音

甲骨文豪掷500亿美元押注AI基建，美国数据中心版图加速扩张

OpenAI总裁豪掷2500万美元支持特朗普，科技巨头与政坛的深度捆绑引关注

相关内容

国产AI模型与Anthropic对比：为何Anthropic被赞“大善人”？兼论国产模型现状与破局之道

构建AI原生应用：从大模型到知识中台，数据与规则逻辑，探究模型、知识、价值原生及与AI赋能的差异

RAG Chunking 2.0：提升文档分块效果的八大实用策略与Python示例

OpenAI 将 ChatGPT 升级为操作系统：支持第三方应用与MCP集成，发布AgentKit

分类

快速链接

DeepSeek-OCR 是什么？

DeepSeek-OCR 实测体验

You Might Also Like

DeepSeek-OCR 测试代码示例

发表回复 取消回复

最新内容

分类

快速链接

发表回复取消回复