modelarts_data

从 OBS 桶中下载到 notebook

import boto3

def download_from_obs(obs_path, local_file_path, aws_access_key, aws_secret_key, endpoint_url='https://obs.cn-northeast-227.dlaicc.com'):
    """
    Download a single object from OBS to the local filesystem.

    The transfer is done with a boto3 S3 client pointed at ``endpoint_url``.

    :param obs_path: Object location written as ``<scheme>://<bucket>/<key>``,
        for example ``obs://my-bucket/dir/file.zip``.
    :param local_file_path: Destination path on the local filesystem.
    :param aws_access_key: Access key ID handed to the client.
    :param aws_secret_key: Secret access key handed to the client.
    :param endpoint_url: URL of the endpoint the S3 client talks to.
    :return: True when the download completed; False when any step raised
        (the error is printed rather than re-raised).
    """
    try:
        # Strip the scheme first, then separate the bucket from the object key.
        location = obs_path.split('://')[1]
        bucket_name, object_path = location.split('/', 1)

        client = boto3.client(
            's3',
            endpoint_url=endpoint_url,
            aws_access_key_id=aws_access_key,
            aws_secret_access_key=aws_secret_key,
        )
        client.download_file(bucket_name, object_path, local_file_path)

        print(f"文件 '{object_path}' 已成功下载到本地路径 '{local_file_path}'")
    except Exception as err:
        # Best-effort helper: report the failure and signal it via the return value.
        print(f"文件下载失败：{err}")
        return False
    return True
# Replace the paths below with your own values.
import os

obs_path = 'obs://lijiaqi/dataset/image_text/ccs_synthetic_filtered_large/images300_1.zip'
local_file_path = '/home/ma-user/work/dataset/images300_1.zip'

# Credentials are read from the environment instead of being hard-coded, so
# that secrets never end up in the notebook or in version control. Export
# OBS_ACCESS_KEY and OBS_SECRET_KEY before running this cell; a missing
# variable raises KeyError naming the variable.
# NOTE(review): any key pair that was previously committed in this file
# should be treated as leaked and rotated.
aws_access_key = os.environ['OBS_ACCESS_KEY']
aws_secret_key = os.environ['OBS_SECRET_KEY']

# Download the object into the notebook's filesystem.
download_from_obs(obs_path, local_file_path, aws_access_key, aws_secret_key)

从notebook上传到桶中

import boto3

def upload_to_obs(obs_path, local_file_path, aws_access_key, aws_secret_key, endpoint_url='https://obs.cn-northeast-227.dlaicc.com'):
    """
    Upload a local file to OBS.

    The transfer is done with a boto3 S3 client pointed at ``endpoint_url``.
    The parameter order mirrors the download helper (OBS path first, local
    path second) so existing positional calls keep working.

    :param obs_path: Destination object written as ``<scheme>://<bucket>/<key>``,
        for example ``obs://my-bucket/dir/model.ckpt``.
    :param local_file_path: Path of the local file to upload.
    :param aws_access_key: Access key ID handed to the client.
    :param aws_secret_key: Secret access key handed to the client.
    :param endpoint_url: URL of the endpoint the S3 client talks to.
    :return: True when the upload completed; False when any step raised
        (the error is printed rather than re-raised).
    """
    try:
        # Strip the scheme, then separate the bucket from the object key.
        bucket_name, object_path = obs_path.split('://')[1].split('/', 1)
        # Echo the parsed destination so a wrong path is easy to spot.
        print(bucket_name, object_path)

        s3 = boto3.client(
            's3',
            endpoint_url=endpoint_url,
            aws_access_key_id=aws_access_key,
            aws_secret_access_key=aws_secret_key,
        )

        # upload_file takes (local filename, bucket, key).
        s3.upload_file(local_file_path, bucket_name, object_path)

        print(f"文件 '{local_file_path}' 已成功上传到 OBS 对象 '{bucket_name}/{object_path}'")
        return True
    except Exception as e:
        # Best-effort helper: report the failure and signal it via the return value.
        print(f"文件上传失败：{e}")
        return False


# Backward-compatible alias: this cell historically exposed the upload helper
# under the name ``download_from_obs`` and callers still use that name.
download_from_obs = upload_to_obs
# Replace the paths below with your own values.
import os

obs_path = 'obs://lijiaqi/dataset/stage2_without.ckpt'
local_file_path = '/home/ma-user/work/train/clip/weight/stage2/withoutloss/stage2_0.ckpt'

# Credentials are read from the environment instead of being hard-coded, so
# that secrets never end up in the notebook or in version control. Export
# OBS_ACCESS_KEY and OBS_SECRET_KEY before running this cell; a missing
# variable raises KeyError naming the variable.
# NOTE(review): any key pair that was previously committed in this file
# should be treated as leaked and rotated.
aws_access_key = os.environ['OBS_ACCESS_KEY']
aws_secret_key = os.environ['OBS_SECRET_KEY']

# Upload the local checkpoint to the bucket. Despite its name, the
# download_from_obs defined in this cell uploads local_file_path to obs_path.
download_from_obs(obs_path, local_file_path, aws_access_key, aws_secret_key)

过滤数据

import re
import json


# Captions whose word count plus punctuation count exceeds this are dropped.
MAX_CAPTION_TOKENS = 70

_WORD_RE = re.compile(r'\b\w+\b')
# The hyphen is escaped on purpose: an unescaped "+-_" inside a character
# class is a RANGE from '+' to '_' and would also match every digit and
# uppercase letter, inflating the punctuation count.
_PUNCTUATION_RE = re.compile(r'[./<>~`@#%^*=+\-_,!?:();&]')


def count_caption_tokens(caption):
    """Return the number of words plus the number of punctuation marks in *caption*.

    Words are maximal runs of ``\\w`` characters; punctuation marks are the
    individual characters listed in ``_PUNCTUATION_RE``. An underscore is
    counted as punctuation even when it sits inside a word.
    """
    return len(_WORD_RE.findall(caption)) + len(_PUNCTUATION_RE.findall(caption))


def filter_by_caption_length(items, max_tokens=MAX_CAPTION_TOKENS):
    """Return the items whose ``'caption'`` has at most *max_tokens* tokens.

    Captions that exceed the limit are printed so they can be inspected.

    :param items: iterable of dicts, each holding a ``'caption'`` string.
    :param max_tokens: inclusive upper bound on words + punctuation marks.
    :return: list of the items that passed the filter, in input order.
    """
    kept = []
    for item in items:
        caption = item['caption']
        if count_caption_tokens(caption) > max_tokens:
            print(caption)
        else:
            kept.append(item)
    return kept


if __name__ == "__main__":
    annotations_path = 'json_data/three.json'
    # Use a context manager so the input file handle is closed promptly.
    with open(annotations_path, 'r', encoding='utf-8') as f:
        all_new = json.load(f)

    filtered_data_list = filter_by_caption_length(all_new)

    # Serialize the surviving records and write them out.
    json_data = json.dumps(filtered_data_list)
    with open('three_filter.json', 'w', encoding='utf-8') as f:
        f.write(json_data)