Datawhale智能汽车多模态(1)--读懂Baseline


赛题:智能驾驶汽车虚拟仿真视频数据理解赛道

赛事链接

基础思路:使用文本与图像进行匹配

Baseline

import paddle
from PIL import Image
from clip import tokenize, load_model
import glob, json, os
import cv2
from PIL import Image
from tqdm import tqdm_notebook
import numpy as np
from sklearn.preprocessing import normalize
import matplotlib.pyplot as plt

# Load the pretrained 'ViT_B_32' model together with `transforms`, the
# preprocessing callable that is applied to a PIL image before inference.
model, transforms = load_model('ViT_B_32', pretrained=True)

# Candidate English labels for every attribute of the submission. For a given
# attribute the labels are tokenized and scored against a video frame, and the
# best-scoring label becomes the prediction.
# NOTE(review): the spellings "scerario" and "pedestrain" are kept verbatim;
# presumably they mirror the official label set -- confirm before correcting.
en_match_words = {
    "scerario": ["suburbs", "city street", "expressway", "tunnel", "parking-lot", "gas or charging stations", "unknown"],
    "weather": ["clear", "cloudy", "raining", "foggy", "snowy", "unknown"],
    "period": ["daytime", "dawn or dusk", "night", "unknown"],
    "road_structure": ["normal", "crossroads", "T-junction", "ramp", "lane merging", "parking lot entrance", "round about", "unknown"],
    # "nothing" used to be listed twice here; the duplicate candidate is dropped.
    "general_obstacle": ["nothing", "speed bumper", "traffic cone", "water horse", "stone", "manhole cover", "unknown"],
    "abnormal_condition": ["uneven", "oil or water stain", "standing water", "cracked", "nothing", "unknown"],
    "ego_car_behavior": ["slow down", "go straight", "turn right", "turn left", "stop", "U-turn", "speed up", "lane change", "others"],
    "closest_participants_type": ["passenger car", "bus", "truck", "pedestrain", "policeman", "nothing", "others", "unknown"],
    "closest_participants_behavior": ["slow down", "go straight", "turn right", "turn left", "stop", "U-turn", "speed up", "lane change", "others"],
}

# Skeleton of the submission file: fixed metadata plus the "test_results"
# list that collects one result dict per processed video.
submit_json = dict(
    author="abc",
    time="231011",
    model="model_name",
    test_results=[],
)

# Collect every entry of the test-video directory, sorted so that the videos
# are always processed in the same order.
paths = sorted(glob.glob('./初赛测试视频/*'))

# Attributes that are actually predicted by the model; every other attribute
# keeps the hard-coded default from `single_video_result` below.
clip_keywords = ("weather", "road_structure")

# The candidate labels are identical for every video, so tokenize them once
# up front instead of once per video.
keyword_tokens = {keyword: tokenize(en_match_words[keyword]) for keyword in clip_keywords}

for video_path in paths:
    print(video_path)

    # os.path.basename also copes with the backslash-separated paths glob
    # returns on Windows, where splitting on '/' would keep the directory part.
    clip_id = os.path.basename(video_path)

    # Only the first frame is used; release the capture right away so that
    # open handles do not pile up while looping over many videos.
    cap = cv2.VideoCapture(video_path)
    try:
        ok, img = cap.read()
    finally:
        cap.release()
    if not ok or img is None:
        # Unreadable entry (corrupt video or not a video at all): skip it
        # instead of crashing inside cv2.cvtColor on a missing frame.
        print(f"warning: could not read a frame from {video_path}, skipping")
        continue

    image = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # convert the BGR frame to RGB
    image = Image.fromarray(image)
    image = transforms(image).unsqueeze(0)  # add a leading batch dimension

    # Default answers for this clip; only the attributes listed in
    # `clip_keywords` are overwritten with model predictions.
    single_video_result = {
        "clip_id": clip_id,
        "scerario" : "cityroad",
        "weather":"unknown",
        "period":"night",
        "road_structure":"ramp",
        "general_obstacle":"nothing",
        "abnormal_condition":"nothing",
        "ego_car_behavior":"turning right",
        "closest_participants_type":"passenger car",
        "closest_participants_behavior":"braking"
    }

    for keyword in clip_keywords:
        with paddle.no_grad():
            logits_per_image, _ = model(image, keyword_tokens[keyword])
            probs = paddle.nn.functional.softmax(logits_per_image, axis=-1)

        # Pick the candidate label with the highest probability and store it
        # as a plain Python string.
        best_index = int(probs.numpy()[0].argmax())
        single_video_result[keyword] = en_match_words[keyword][best_index]

    submit_json["test_results"].append(single_video_result)
    
# Write the collected results to disk; ensure_ascii=False keeps non-ASCII
# characters as-is instead of emitting \uXXXX escapes.
with open('clip_result.json', 'w', encoding='utf-8') as out_file:
    out_file.write(json.dumps(submit_json, ensure_ascii=False))

读懂baseline

导入必要的库

import paddle
from PIL import Image
from clip import tokenize, load_model
import glob, json, os
import cv2
from PIL import Image
from tqdm import tqdm_notebook #可视化
import numpy as np
from sklearn.preprocessing import normalize
import matplotlib.pyplot as plt

加载预训练模型

# Load the pretrained 'ViT_B_32' model together with `transforms`, the
# preprocessing callable that is applied to a PIL image before inference.
model, transforms = load_model('ViT_B_32', pretrained=True)

创建字典变量用于输入文本信息

# Candidate English labels for every attribute of the submission. For a given
# attribute the labels are tokenized and scored against a video frame, and the
# best-scoring label becomes the prediction.
# NOTE(review): the spellings "scerario" and "pedestrain" are kept verbatim;
# presumably they mirror the official label set -- confirm before correcting.
en_match_words = {
    "scerario": ["suburbs", "city street", "expressway", "tunnel", "parking-lot", "gas or charging stations", "unknown"],
    "weather": ["clear", "cloudy", "raining", "foggy", "snowy", "unknown"],
    "period": ["daytime", "dawn or dusk", "night", "unknown"],
    "road_structure": ["normal", "crossroads", "T-junction", "ramp", "lane merging", "parking lot entrance", "round about", "unknown"],
    # "nothing" used to be listed twice here; the duplicate candidate is dropped.
    "general_obstacle": ["nothing", "speed bumper", "traffic cone", "water horse", "stone", "manhole cover", "unknown"],
    "abnormal_condition": ["uneven", "oil or water stain", "standing water", "cracked", "nothing", "unknown"],
    "ego_car_behavior": ["slow down", "go straight", "turn right", "turn left", "stop", "U-turn", "speed up", "lane change", "others"],
    "closest_participants_type": ["passenger car", "bus", "truck", "pedestrain", "policeman", "nothing", "others", "unknown"],
    "closest_participants_behavior": ["slow down", "go straight", "turn right", "turn left", "stop", "U-turn", "speed up", "lane change", "others"],
}

定义输出json文件的基本内容的字典

# Skeleton of the submission file: fixed metadata plus the "test_results"
# list that collects one result dict per processed video.
submit_json = dict(
    author="abc",
    time="231011",
    model="model_name",
    test_results=[],
)

获取视频地址

# Collect every entry of the test-video directory, sorted so that the videos
# are always processed in the same order.
paths = sorted(glob.glob('./初赛测试视频/*'))

接下来核心部分是一个循环,对给定的视频路径列表进行迭代,并依次处理每个视频。

具体分析内容在注释部分

# Attributes that are actually predicted by the model: weather and road
# structure. Every other attribute keeps the hard-coded default from
# `single_video_result` below.
clip_keywords = ("weather", "road_structure")

# The candidate labels are identical for every video, so tokenize them once
# up front instead of once per video.
keyword_tokens = {keyword: tokenize(en_match_words[keyword]) for keyword in clip_keywords}

for video_path in paths:
    print(video_path)

    # The last path component serves as the clip id. os.path.basename also
    # copes with the backslash-separated paths glob returns on Windows, where
    # splitting on '/' would keep the directory part.
    clip_id = os.path.basename(video_path)

    # Open the video with OpenCV and read a single frame (the first one).
    # Release the capture right away so that open handles do not pile up
    # while looping over many videos.
    cap = cv2.VideoCapture(video_path)
    try:
        ok, img = cap.read()
    finally:
        cap.release()
    if not ok or img is None:
        # Unreadable entry (corrupt video or not a video at all): skip it
        # instead of crashing inside cv2.cvtColor on a missing frame.
        print(f"warning: could not read a frame from {video_path}, skipping")
        continue

    image = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # convert the BGR frame to RGB
    image = Image.fromarray(image)  # wrap the array in a PIL image
    image = transforms(image).unsqueeze(0)  # preprocess and add a leading batch dimension

    # Result record for this clip, pre-filled with default answers; only the
    # attributes listed in `clip_keywords` are overwritten with predictions.
    single_video_result = {
        "clip_id": clip_id,
        "scerario" : "cityroad",
        "weather":"unknown",
        "period":"night",
        "road_structure":"ramp",
        "general_obstacle":"nothing",
        "abnormal_condition":"nothing",
        "ego_car_behavior":"turning right",
        "closest_participants_type":"passenger car",
        "closest_participants_behavior":"braking"
    }

    for keyword in clip_keywords:
        with paddle.no_grad():
            # Score the frame against every candidate label of this attribute.
            logits_per_image, _ = model(image, keyword_tokens[keyword])
            # Turn the image-to-text logits into a probability distribution.
            probs = paddle.nn.functional.softmax(logits_per_image, axis=-1)

        # Pick the candidate label with the highest probability and store it
        # as a plain Python string.
        best_index = int(probs.numpy()[0].argmax())
        single_video_result[keyword] = en_match_words[keyword][best_index]

    submit_json["test_results"].append(single_video_result)  # collect this clip's result

最后结果写入clip_result.json

以写入模式 'w' 打开文件,并指定编码为 UTF-8;ensure_ascii=False 表示不把非 ASCII 字符转义成 \uXXXX 形式,因此写入的文件中可以直接包含中文等非 ASCII 字符。

# Write the collected results to disk; ensure_ascii=False keeps non-ASCII
# characters as-is instead of emitting \uXXXX escapes.
with open('clip_result.json', 'w', encoding='utf-8') as out_file:
    out_file.write(json.dumps(submit_json, ensure_ascii=False))

提分(df)思路

对视频每秒取5帧,而不是只取一帧

更换模型ViT-L/14

尝试把颜色空间转换由 cv2.COLOR_BGR2RGB 换成 cv2.COLOR_BGR2GRAY(即改用灰度图作为输入)

输入描述性的完整句子,而不是单个描述性单词,例如:把“unknown”改为“In the parking lot, there’s no way to know if it’s day or night”

太菜了想不出来了….


Author: 寒风渐微凉
Reprint policy: Unless otherwise stated, all articles in this blog are licensed under CC BY 4.0. If you reproduce this article, please credit the source 寒风渐微凉!
 Previous
Next 
  TOC