从桶中下载到notebook

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import boto3

def download_from_obs(
    obs_path: str,
    local_file_path: str,
    aws_access_key: str,
    aws_secret_key: str,
    endpoint_url: str = 'https://obs.cn-northeast-227.dlaicc.com',
) -> bool:
    """Download one object from an OBS bucket to a local file.

    The transfer is done with a boto3 S3 client pointed at ``endpoint_url``.

    :param obs_path: Object location in the form
        ``<scheme>://<bucket>/<object key>``, e.g. ``obs://bucket/dir/a.zip``.
    :param local_file_path: Destination path on the local filesystem.
    :param aws_access_key: Access key ID handed to the client.
    :param aws_secret_key: Secret access key handed to the client.
    :param endpoint_url: Service endpoint URL. Override it for buckets that
        live behind a different endpoint than the default in the signature.
    :return: ``True`` when the download succeeds, ``False`` on any failure.
        Failures are printed, never raised.
    """
    try:
        # Split "<scheme>://<bucket>/<key>" into bucket and key, and reject
        # malformed paths up front so the printed error names the real problem.
        _, separator, remainder = obs_path.partition('://')
        bucket_name, _, object_path = remainder.partition('/')
        if not separator or not bucket_name or not object_path:
            raise ValueError(
                f"无效的 OBS 路径 '{obs_path}',应为 'obs://<桶名>/<对象路径>'"
            )

        # OBS is accessed through its S3-compatible API.
        s3 = boto3.client(
            's3',
            endpoint_url=endpoint_url,
            aws_access_key_id=aws_access_key,
            aws_secret_access_key=aws_secret_key,
        )
        s3.download_file(bucket_name, object_path, local_file_path)
    except Exception as e:
        # Deliberately broad: callers rely on a boolean result, not exceptions.
        print(f"文件下载失败:{e}")
        return False

    print(f"文件 '{object_path}' 已成功下载到本地路径 '{local_file_path}'")
    return True
import os

# Replace these with your own bucket object and local destination.
obs_path = 'obs://lijiaqi/dataset/image_text/ccs_synthetic_filtered_large/images300_1.zip'
local_file_path = '/home/ma-user/work/dataset/images300_1.zip'

# Credentials are read from the environment so they are never stored in the
# source. Export OBS_ACCESS_KEY and OBS_SECRET_KEY before running; a missing
# variable raises KeyError naming it.
# NOTE: any key pair that was previously written in this file should be
# treated as exposed and rotated.
aws_access_key = os.environ['OBS_ACCESS_KEY']
aws_secret_key = os.environ['OBS_SECRET_KEY']

# Download the object into the notebook's filesystem.
download_from_obs(obs_path, local_file_path, aws_access_key, aws_secret_key)

从notebook上传到桶中

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import boto3

def upload_to_obs(
    obs_path: str,
    local_file_path: str,
    aws_access_key: str,
    aws_secret_key: str,
    endpoint_url: str = 'https://obs.cn-northeast-227.dlaicc.com',
) -> bool:
    """Upload a local file to an object in an OBS bucket.

    The transfer is done with a boto3 S3 client pointed at ``endpoint_url``.

    :param obs_path: Destination object in the form
        ``<scheme>://<bucket>/<object key>``, e.g. ``obs://bucket/dir/a.ckpt``.
    :param local_file_path: Path of the local file to upload.
    :param aws_access_key: Access key ID handed to the client.
    :param aws_secret_key: Secret access key handed to the client.
    :param endpoint_url: Service endpoint URL. Override it for buckets that
        live behind a different endpoint than the default in the signature.
    :return: ``True`` when the upload succeeds, ``False`` on any failure.
        Failures are printed, never raised.
    """
    try:
        # Split "<scheme>://<bucket>/<key>" into bucket and key, and reject
        # malformed paths up front so the printed error names the real problem.
        _, separator, remainder = obs_path.partition('://')
        bucket_name, _, object_path = remainder.partition('/')
        if not separator or not bucket_name or not object_path:
            raise ValueError(
                f"无效的 OBS 路径 '{obs_path}',应为 'obs://<桶名>/<对象路径>'"
            )
        print(bucket_name, object_path)

        # OBS is accessed through its S3-compatible API.
        s3 = boto3.client(
            's3',
            endpoint_url=endpoint_url,
            aws_access_key_id=aws_access_key,
            aws_secret_access_key=aws_secret_key,
        )
        s3.upload_file(local_file_path, bucket_name, object_path)
    except Exception as e:
        # Deliberately broad: callers rely on a boolean result, not exceptions.
        print(f"文件上传失败:{e}")
        return False

    print(f"文件 '{local_file_path}' 已成功上传到 OBS 路径 '{object_path}'")
    return True


# Backward-compatible alias: this section originally exposed the uploader under
# the name ``download_from_obs`` and existing calls still use it.
# NOTE: when this runs in the same module or notebook session as a download
# helper of the same name, this binding replaces that helper.
download_from_obs = upload_to_obs
import os

# Replace these with your own destination object and local source file.
obs_path = 'obs://lijiaqi/dataset/stage2_without.ckpt'
local_file_path = '/home/ma-user/work/train/clip/weight/stage2/withoutloss/stage2_0.ckpt'

# Credentials are read from the environment so they are never stored in the
# source. Export OBS_ACCESS_KEY and OBS_SECRET_KEY before running; a missing
# variable raises KeyError naming it.
# NOTE: any key pair that was previously written in this file should be
# treated as exposed and rotated.
aws_access_key = os.environ['OBS_ACCESS_KEY']
aws_secret_key = os.environ['OBS_SECRET_KEY']

# Upload the local file to the bucket. The function defined in this section
# performs an upload even though it is still called download_from_obs.
download_from_obs(obs_path, local_file_path, aws_access_key, aws_secret_key)

过滤数据

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
import re
import json


# Captions whose word count plus punctuation count exceeds this are dropped.
MAX_CAPTION_TOKENS = 70

# Compiled once because they are applied to every caption.
_WORD_RE = re.compile(r'\b\w+\b')
# The hyphen is escaped on purpose: an unescaped "+-_" inside a character class
# is a range that also matches digits, uppercase letters and brackets.
_PUNCTUATION_RE = re.compile(r'[./<>~`@#%^*=+\-_,!?:();&]')


def count_caption_tokens(caption):
    """Return the number of words plus listed punctuation marks in *caption*.

    A word is a maximal run of ``\\w`` characters; punctuation is any single
    character from the fixed set in ``_PUNCTUATION_RE``.
    """
    return len(_WORD_RE.findall(caption)) + len(_PUNCTUATION_RE.findall(caption))


def filter_captions(items, max_tokens=MAX_CAPTION_TOKENS):
    """Keep the items whose ``'caption'`` has at most *max_tokens* tokens.

    Captions that are dropped are printed so they can be inspected.

    :param items: Iterable of dicts, each with a ``'caption'`` string.
    :param max_tokens: Inclusive upper bound on the token count.
    :return: List of the kept items, in their original order.
    """
    kept = []
    for item in items:
        caption = item['caption']
        if count_caption_tokens(caption) > max_tokens:
            print(caption)
        else:
            kept.append(item)
    return kept


if __name__ == '__main__':
    annotations_path = 'json_data/three.json'
    with open(annotations_path, 'r', encoding='utf-8') as f:
        all_new = json.load(f)

    # Drop over-long captions.
    filtered_data_list = filter_captions(all_new)

    # Serialize the kept items and write them out.
    json_data = json.dumps(filtered_data_list)
    with open('three_filter.json', 'w', encoding='utf-8') as f:
        f.write(json_data)