Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 9 additions & 4 deletions basic_dedup/write_meta_data_pkl.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,21 @@
import hashlib
import pandas as pd
import argparse
import time

# 计算文件的 SHA256 哈希值
def sha256(filename, chunk_size=1024 * 1024 * 256):
    """Return the hex-encoded SHA256 digest of the file at ``filename``.

    The file is read in fixed-size chunks that are fed to the hash object
    incrementally, so peak memory use is bounded by ``chunk_size`` instead
    of growing with the size of the file.

    Args:
        filename: Path of the file to hash; it is opened in binary mode.
        chunk_size: Maximum number of bytes read per iteration. Defaults to
            256 MiB. Must be a positive integer.

    Returns:
        The SHA256 digest as a hexadecimal string.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
        OSError: If the file cannot be opened or read.
    """
    # read(0) returns b"" immediately, which would silently yield the digest
    # of an empty input; a negative size would read the whole file at once.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    hash_sha256 = hashlib.sha256()
    with open(filename, 'rb') as f:
        # iter(callable, sentinel) keeps calling read() until it returns
        # b"", i.e. until end of file.
        for chunk in iter(lambda: f.read(chunk_size), b""):
            hash_sha256.update(chunk)
    return hash_sha256.hexdigest()

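# Recursively walk the directory and collect the path of every file found
# (file size and SHA256 hash are computed later, when the metadata is written)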
def get_all_files_list(dir_path):
file_path_list = []
for root, _, files in os.walk(dir_path):
#print(files)
for file in files:
file_path = os.path.join(root, file)
file_path_list.append(file_path)
Expand All @@ -29,6 +33,7 @@ def write_to_csv(dir_path, pkl_file='files.pkl'):

data = {'File': [], 'Size': [], 'SHA256': []}
file_path_set = set(get_all_files_list(dir_path))
#print(file_path_set)

file_path_set -= set(existing_df['File'])

Expand All @@ -40,8 +45,8 @@ def write_to_csv(dir_path, pkl_file='files.pkl'):
data['Size'].append(file_size)
data['SHA256'].append(file_sha256)
except:
print('file not exist: {}'.format(filepath))

print("file not exist:" + filepath)
df = pd.concat([existing_df, pd.DataFrame(data)], ignore_index=True)

# 将 DataFrame 写入 pickle 文件
Expand Down