当前位置:网站首页>Large CSV split and merge
Large CSV split and merge
2022-07-03 15:32:00 【ASKCOS】
import pandas as pd
import os
import warnings
warnings.filterwarnings('ignore')
class PyCSV:
def merge_csv(self, save_name, file_dir, csv_encoding='utf-8'):
""" :param save_name: The name of the file saved after merging , User input required :param file_dir: Need to merge csv File folder :param csv_encoding: csv File encoding , Default utf-8 :return: None """
# File path saved after merging = You need to merge the folder where the file is located + The name of the merged file
self.save_path = os.path.join(file_dir, save_name)
self.__check_name()
# Specified encoding
self.encoding = csv_encoding
# Need to merge csv File folder
self.file_dir = file_dir
self.__check_dir_exist(self.file_dir)
# File path list
self.file_list = [os.path.join(self.file_dir, i) for i in os.listdir(self.file_dir)]
self.__check_singal_dir(self.file_list)
# Merge into the specified file
print(" Began to merge csv file !")
for file in self.file_list:
df = pd.read_csv(file, encoding=self.encoding)
df.to_csv(self.save_path, index=False, quoting=1, header=not os.path.exists(self.save_path), mode='a')
print(f"{
file} Has been merged into {
self.save_path} !")
print(" All files have been merged !")
def split_csv(self, csv_path, save_dir, split_line=100000, csv_encoding='utf-8'):
""" Split the file and get csv file information . :param csv_path: csv File path :param save_dir: Save path of segmentation file :param split_line: Divide according to the number of lines , The default is 10 ten thousand :param csv_encoding: csv File encoding format :return: None """
# Pass in csv File path and small after segmentation csv Save path of file
self.csv_path = csv_path
self.save_dir = save_dir
# testing csv Whether the file path and save path conform to the specification
self.__check_dir_exist(self.save_dir)
self.__check_file_exist(self.csv_path)
# Set the encoding format
self.encoding = csv_encoding
# according to split_line That's ok , Segmentation
self.split_line = split_line
print(" Splitting files ... ")
# Get file size
self.file_size = round(os.path.getsize(self.csv_path) / 1024 / 1024, 2)
# Get the number of data rows
self.line_numbers = 0
# The suffix of the file after segmentation
i = 0
# df generator , Each element is a df,df The number of lines is split_line, Default 100000 That's ok
df_iter = pd.read_csv(self.csv_path,
chunksize=self.split_line,
encoding=self.encoding)
# Generate one at a time df, Until all the data is retrieved
for df in df_iter:
# Suffix from 1 Start
i += 1
# Total rows of statistical data
self.line_numbers += df.shape[0]
# Set the save path of the file after segmentation
save_filename = os.path.join(self.save_dir, self.filename + "_" + str(i) + self.extension)
# Print and save information
print(f"{
save_filename} Generated !")
# Save the number after segmentation
df.to_csv(save_filename, index=False, encoding='utf-8', quoting=1)
# Get data column name
self.column_names = pd.read_csv(self.csv_path, nrows=10).columns.tolist()
print(" The segmentation is finished !")
return None
def __check_dir_exist(self, dirpath):
""" test save_dir Whether there is , If it does not exist, create the folder . :return: None """
if not os.path.exists(dirpath):
raise FileNotFoundError(f'{
dirpath} directory does not exist , Please check !')
if not os.path.isdir(dirpath):
raise TypeError(f'{
dirpath} The destination path is not a folder , Please check !')
def __check_file_exist(self, csv_path):
""" test csv_path Whether it is CSV file . :return: None """
if not os.path.exists(csv_path):
raise FileNotFoundError(f'{
csv_path} file does not exist , Please check the file path !')
if not os.path.isfile(csv_path):
raise TypeError(f'{
csv_path} The path is not in file format , Please check !')
# File existence path
self.file_path_root = os.path.split(csv_path)[0]
# File name
self.filename = os.path.split(csv_path)[1].replace('.csv', '').replace('.CSV', '')
# file extension
self.extension = os.path.splitext(csv_path)[1]
if self.extension.upper() != '.CSV':
raise TypeError(f'{
csv_path} Wrong file type , Not CSV file type , Please check !')
def __check_name(self):
""" Check whether the file name .csv ending :return: """
if not self.save_path.upper().endswith('.CSV'):
raise TypeError(' File name setting error ')
def __check_singal_dir(self, file_list):
""" Check what needs to be merged csv Whether the folder where the file is located meets the requirements . 1. There should be no division csv Documents other than documents 2. There should be no folder . :return: """
for file in file_list:
if os.path.isdir(file):
raise EnvironmentError(f' Found folder {
file}, There are other folders in the current folder , Please check !')
if not file.upper().endswith('.CSV'):
raise EnvironmentError(f' Non discovery CSV file :{
file}, Please make sure that the current folder only stores csv file !')
if __name__ == '__main__':
# Test segmentation
csv_path = r'E:\simple.csv'
save_dir = r'E:\simple_splited_files'
PyCSV().split_csv(csv_path, save_dir, split_line=10000)
# Test merge
files_dir = r'E:\simple_splited_files'
save_name = r'merge_simple.csv'
PyCSV().merge_csv(save_name, files_dir)
https://zhuanlan.zhihu.com/p/431104537
边栏推荐
- leetcode_ Power of Four
- redis单线程问题强制梳理门外汉扫盲
- Automatic generation of client code from flask server code -- Introduction to flask native stubs Library
- Detailed comments on MapReduce instance code on the official website
- 求字符串函数和长度不受限制的字符串函数的详解
- Characteristics of MySQL InnoDB storage engine -- Analysis of row lock
- Under VC, Unicode and ANSI are converted to each other, cstringw and std:: string are converted to each other
- WinDbg analysis dump file
- Unity功能——Unity离线文档下载及使用
- PyTorch crop images differentiablly
猜你喜欢
Tensorflow realizes verification code recognition (III)
百度智能云助力石嘴山市升级“互联网+养老服务”智慧康养新模式
qt使用QZxing生成二维码
Kubernetes带你从头到尾捋一遍
Visual upper system design and development (Halcon WinForm) -5 camera
Jvm-03-runtime data area PC, stack, local method stack
Tensorflow realizes verification code recognition (I)
Kubernetes vous emmène du début à la fin
秒杀系统3-商品列表和商品详情
找映射关系
随机推荐
在MapReduce中利用MultipleOutputs输出多个文件
Analysis of development mode process based on SVN branch
Jvm-04-runtime data area heap, method area
leetcode_ Power of Four
视觉上位系统设计开发(halcon-winform)-2.全局变量设计
Use of Tex editor
Markdown file titles are all reduced by one level
Halcon and WinForm study section 1
Win10 enterprise 2016 long term service activation tutorial
Nppexec get process return code
[cloud native training camp] module VIII kubernetes life cycle management and service discovery
Enable multi-threaded download of chrome and edge browsers
Unity function - unity offline document download and use
Halcon与Winform学习第一节
Introduction, use and principle of synchronized
如何使用 @NotNull等注解校验 并全局异常处理
Calibre LVL
《微服务设计》读书笔记(下)
Vs2017 is driven by IP debugging (dual machine debugging)
Kubernetes will show you from beginning to end