使用Python高效读取ZIP压缩文件中的JSON数据

2025-10-15 09:41 开发作者： weixin_30777913

准备工作与环境设置

在开始之前，确保已安装必要的Python库：

pip install pandas pyspark pyarrow

使用标准库方法读取ZIP中的JSON

Python的标准库zipfile提供了处理ZIP文件的基本功能。以下是基础读取方法：

import zipfile
import json
import pandas as pd
from pyspark.sql import SparkSession

# 初始化Spark会话
spark = SparkSession.builder \
    .appName("ZIP_JSON_Reader") \
    .getOrCreate()

高效读取方法

方法1：使用zipfile和io模块

import zipfile
import json
import io

def read_json_from_zip_basic(zip_path, json_filename):
    """基础方法：读取ZIP中的单个JSON文件"""
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        with zip_ref.open(json_filename, 'r') as json_file:
            # 读取并解析JSON内容
            json_data = json.loads(json_file.read().decode('utf-8'))
            return json_data

方法2：批量处理ZIP中的多个JSON文件

def read_multiple_json_from_zip(zip_path, file_extension='.json'):
    """读取ZIP中所有JSON文件"""
    all_data = []
    
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        # 获取所有JSON文件
        json_files = [f for f in zip_ref.namelist() 
                     if f.endswith(file_extension)]
        
        for json_file in json_files:
            with zip_ref.open(json_file, 'r') as file:
                try:
                    json_data = json.loads(file.read().decode('utf-8'))
                    all_data.append(json_data)
                except json.JSONDecodeError as e:
                    print(f"Error reading {json_file}: {e}")
    
    return all_data

转换为Pandas DataFrame

方法1：直接转换

def zip_json_to_pandas_simple(zip_path, json_filename):
    """将ZIP中的JSON文件转换为Pandas DataFrame（简单版）"""
    json_data = read_json_from_zip_basic(zip_path, json_filename)
    
    # 如果JSON是数组格式，直接转换为DataFrame
    if isinstance(json_data, list):
        return pd.DataFrame(json_data)
    # 如果JSON是对象格式，可能需要特殊处理
    else:
        return pd.DataFrame([json_data])

方法2：使用pandas直接读取（推荐）

def zip_json_to_pandas_efficient(zip_path, json_filename):
    """高效方法：使用pandas直接读取ZIP中的JSON文件"""
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        with zip_ref.open(json_filename, 'r') as json_file:
            # 使用pandas直接读取JSON流
            df = pd.read_json(json_file, encoding='utf-8')
            return df

方法3：处理大型JSON文件

import ijson

def read_large_json_from_zip(zip_path, json_filename):
    """使用流式处理读取大型JSON文件"""
    items = []
    
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        with zip_ref.open(json_filename, 'r') as json_file:
            # 使用ijson进行流式解析
           LOtKUmTm parser = ijson.parse(json_file)
            
            for prefix, event, value in parser:
                # 根据JSON结构进行相应处理
                if event == 'start_array' or event == 'end_array':
                    continue
                # 这里需要根据实际JSON结构调整解析逻辑
                
    return pd.DataFrame(items)

转换为PySpark DataFrame

方法1：通过Pandas中转

def zip_json_to_pyspark_via_pandas(zip_path, json_filename):
    """通过Pandas将ZIP中的JSON转换为PySpark DataFrame"""
    # 先读取为Pandas DataFrame
    pandas_df = zip_json_to_pandas_efficient(zip_path, json_filename)
    
    # 转换为PySpark DataFrame
    spark_df = spark.createDataFrame(pandas_df)
    return spark_df

方法2：直接使用PySpark读取（需解压）

import tempfile
import os

def zip_json_to_pyspark_direct(zip_path, json_filename):
    """将ZIP文件解压后使用PySpark直接读取"""
    with tempfile.TemporaryDirectory() as temp_dir:
        # 解压ZIP文件
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extract(json_filename, temp_dir)
        
        # 使用PySpark读取解压后的JSON文件
        json_path = os.path.join(temp_dir, json_filename)
        spark_df = spark.read \
            .option("encoding", "UTF-8") \
            .json(json_path)
        
        return spark_df

方法3：处理ZIP中的多个JSON文件

def multiple_zip_json_to_pyspark(zip_path):
    """读取ZIP中所有JSON文件到PySpark DataFrame"""
    all_dfs = []
    
    with tempfile.TemporaryDirectory() as temp_dir:
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            # 解压所有JSON文件
            json_files = [f for f in zip_ref.namelist() if f.endswith('.json')]
            zip_ref.extractall(temp_dir, json_files)
        
        # 读取所有JSON文件
        for json_file in json_files:
            json_path = os.path.join(temp_dir, json_file)
            df = spark.read.option("encoding", "UTF-8http://www.devze.com").json(json_path)
            all_dfs.append(df)
    
    # 合并所有DataFrame
    if all_dfs:
        result_df = all_dfs[0]
        for df in all_dfs[1:]:
            result_df = result_df.union(df)
        return result_df
    else:
        return spark.createDataFrame([], schema=None)

处理大型ZIP文件的策略

方法1：分块读取

def read_large_zip_json_chunked(zip_path, json_filename, chunk_size=1000):
    """分块读取大型ZIP中的JSON文件"""
    chunks = []
    
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        with zip_ref.open(json_filename, 'r') as json_file:
            # 使用pandas的分块读取功能
            for chunk in pd.read_json(json_file, encoding='utf-8', 
js                                    lines=True, chunksize=chunk_size):
                chunks.append(chunk)
    
    # 合并所有块
    if chunks:
        return pd.concat(chunks, ignore_index=True)
    else:
        return pd.DataFrame()

方法2：使用内存映射

def read_zip_json_with_mmap(zip_path, json_filename):
    """使用内存映射处理大型ZIP文件"""
    import mmap
    
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        # 获取文件信息
        file_info = zip_ref.getinfo(json_filename)
        
        with zip_ref.open(json_filename, 'r') as json_file:
            # 创建内存映射
            with mmap.mmap(json_file.fileno(), 0, Access=mmap.ACCESS_READ) as mmapped_file:
                df = pd.read_json(mmapped_file, encoding='utf-8')
                return df

性能优化建议

1. 使用适当的数据类型

def optimize_pandas_dataframe(df):
    """优化Pandas DataFrame的内存使用"""
    # 转换数据类型以减少内存使用
    for col in df.columns:
        if df[col].dtype == 'object':
            # 尝试转换为分类类型
            if df[col].nunique() / len(df) < 0.5:
                df[col] = df[col].astype('category')
        
        # 转换数值类型
        elif df[col].dtype in ['int64', 'float64']:
            df[col] = pd.to_numeric(df[col], downcast='integer')
    
    return df

2. 并行处理

from concurrent.futures import ThreadPoolExecutor

def parallel_zip_processing(zip_paths, processing_function, max_workers=4):
    """并行处理多个ZIP文件"""
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        results = list(executor.map(processing_function, zip_paths))
    return results

完整示例代码

import zipfile
import json
import pandas as pd
from pyspark.sql import SparkSession
import tempfile
import os

class ZIPJSONReader:
    """ZIP文件中的JSON读取器"""
    
    def __init__(self):
        self.spark = SparkSession.builder \
            .appName("ZIP_JSON_Reader") \
            .getOrCreate()
    
    def read_to_pandas(self, zip_path, json_filename=None, optimize=True):
        """读取ZIP中的JSON文件到Pandas DataFrame"""
        
        # 如果未指定文件名，自动查找第一个JSON文件
        if json_filename is None:
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                json_files = [f for f in zip_ref.namelist() 
                            if f.endswith('.json')]
                if not json_files:
                    raise ValueError("No JSON files found in ZIP")
                json_filename = json_files[0]
        
        # 读编程客栈取JSON文件
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            with zip_ref.open(json_filename, 'r') as json_file:
                df = pd.read_json(json_file, encoding='utf-8')
        
        # 优化内存使用
        if optimize:
            df = self._optimize_dataframe(df)
        
        return df
    
    def read_to_pyspark(self, zip_path, json_filename=None):
        """读取ZIP中的JSON文件到PySpark DataFrame"""
        
        # 使用临时目录解压文件
        with tempfile.TemporaryDirectory() as temp_dir:
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                if json_filename:
                    # 解压指定文件
                    zip_ref.extract(json_filename, temp_dir)
                    json_path = os.path.join(temp_dir, json_filename)
                else:
                    # 解压所有JSON文件
                    json_files = [f for f in zip_ref.namelist() 
                                if f.endswith('.json')]
                    if not json_files:
                        raise ValueError("No JSON files found in ZIP")
                    zip_ref.extractall(temp_dir, json_files)
                    json_path = temp_dir
            
            # 使用PySpark读取
            df = self.spark.read \
                .option("encoding", "UTF-8") \
                .json(json_path)
            
            return df
    
    def _optimize_dataframe(self, df):
        """优化DataFrame内存使用"""
        for col in df.columns:
            col_type = df[col].dtype
            
            if col_type == 'object':
                # 转换为分类类型
                num_unique_values = len(df[col].unique())
                num_total_values = len(df[col])
                if num_unique_values / num_total_values < 0.5:
                    df[col] = df[col].astype('category')
            elif col_type in ['int64']:
                # 下转换整数类型
                df[col] = pd.to_numeric(df[col], downcaLOtKUmTmst='integer')
            elif col_type in ['float64']:
                # 下转换浮点类型
                df[col] = pd.to_numeric(df[col], downcast='float')
        
        return df
    
    def close(self):
        """关闭Spark会话"""
        self.spark.stop()

# 使用示例
if __name__ == "__main__":
    reader = ZIPJSONReader()
    
    try:
        # 读取到Pandas
        pandas_df = reader.read_to_pandas('data.zip', 'example.json')
        print("Pandas DataFrame:")
        print(pandas_df.head())
        print(f"Pandas DataFrame shape: {pandas_df.shape}")
        
        # 读取到PySpark
        spark_df = reader.read_to_pyspark('data.zip', 'example.json')
        print("\nPySpark DataFrame:")
        spark_df.show(5)
        print(f"PySpark DataFrame count: {spark_df.count()}")
        
    finally:
        reader.close()

总结

本文介绍了多种高效读取ZIP压缩文件中UTF-8编码JSON数据的方法：

对于Pandas DataFrame：

使用zipfile和pandas.read_json直接读取
处理大型文件时使用分块读取
优化数据类型以减少内存使用

对于PySpark DataFrame：

通过Pandas中转（适合中小型数据）
解压后直接读取（适合大型数据）
支持处理ZIP中的多个JSON文件

性能优化：

使用适当的数据类型
并行处理多个文件
流式处理大型文件

根据数据大小和处理需求选择合适的方法，可以在保证性能的同时高效地处理ZIP压缩文件中的JSON数据。

以上就是使用Python高效读取ZIP压缩文件中的JSON数据的详细内容，更多关于Python读取ZIP文件的JSON数据的资料请关注编程客栈(www.devze.com)其它相关文章！

继续阅读：Python读取ZIP文件 Python读取ZIP文件的JSON数据 Python读取文件数据

使用Python高效读取ZIP压缩文件中的JSON数据

目录

准备工作与环境设置

使用标准库方法读取ZIP中的JSON

高效读取方法

方法1：使用zipfile和io模块

方法2：批量处理ZIP中的多个JSON文件

转换为Pandas DataFrame

方法1：直接转换

方法2：使用pandas直接读取（推荐）

方法3：处理大型JSON文件

转换为PySpark DataFrame

方法1：通过Pandas中转

方法2：直接使用PySpark读取（需解压）

方法3：处理ZIP中的多个JSON文件

处理大型ZIP文件的策略

方法1：分块读取

方法2：使用内存映射

性能优化建议

1. 使用适当的数据类型

2. 并行处理

完整示例代码

总结

更多精彩内容

精彩评论

最新开发

C++分布式语音识别服务实践方案

C#屏幕录制中遇到黑屏问题的原因和解决方法

C#利用.NET实现Word文档到HTML的高效转换

SpringBoot Maven的操作与配置图文教程

Spring Boot 整合第三方组件Redis、MyBatis、Kafka 实战案例指南

开发排行榜

springboot后端存储富文本内容的思路与步骤(含图片内容)

PyCharm运行python测试,报错“没有发现测试”/“空套件”的解决

return base64.b64encode(b).decode(

基于C语言实现钻石棋游戏的示例代码

Sublime Text 3解决中文乱码问题（实测可用）