From 75d0945ca424d3d63980de3531b0be12e7def560 Mon Sep 17 00:00:00 2001
From: Jaeyong Kang
Date: Wed, 25 Oct 2023 20:02:09 +0800
Subject: [PATCH] Add dataset/vevo_dataset.py

---
 dataset/vevo_dataset.py | 862 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 862 insertions(+)
 create mode 100644 dataset/vevo_dataset.py

diff --git a/dataset/vevo_dataset.py b/dataset/vevo_dataset.py
new file mode 100644
index 0000000..3f2eb0a
--- /dev/null
+++ b/dataset/vevo_dataset.py
@@ -0,0 +1,862 @@
+import os
+import pickle
+import random
+import torch
+import torch.nn as nn
+import numpy as np
+
+from torch.utils.data import Dataset
+from utilities.constants import *
+from utilities.device import cpu_device
+from utilities.device import get_device
+
+import json
+
+SEQUENCE_START = 0
+
+class VevoDataset(Dataset):
+    def __init__(self, dataset_root = "./dataset/", split="train", split_ver="v1", vis_models="2d/clip_l14p", emo_model="6c_l14p", max_seq_chord=300, max_seq_video=300, random_seq=True, is_video = True):
+
+        self.dataset_root = dataset_root
+
+        self.vevo_chord_root = os.path.join( dataset_root, "vevo_chord", "lab_v2_norm", "all")
+        self.vevo_emotion_root = os.path.join( dataset_root, "vevo_emotion", emo_model, "all")
+        self.vevo_motion_root = os.path.join( dataset_root, "vevo_motion", "all")
+        self.vevo_scene_offset_root = os.path.join( dataset_root, "vevo_scene_offset", "all")
+        self.vevo_meta_split_path = os.path.join( dataset_root, "vevo_meta", "split", split_ver, split + ".txt")
+
+        self.vevo_loudness_root = os.path.join( dataset_root, "vevo_loudness", "all")
+        self.vevo_note_density_root = os.path.join( dataset_root, "vevo_note_density", "all")
+
+        self.max_seq_video = max_seq_video
+        self.max_seq_chord = max_seq_chord
+        self.random_seq = random_seq
+        self.is_video = is_video
+
+        self.vis_models_arr = vis_models.split(" ")
+        self.vevo_semantic_root_list = []
+        self.id_list = []
+
+        self.emo_model = emo_model
+
+        # use the per-instance flag so is_video=False fully disables semantic features
+        if self.is_video:
+            for i in range( len(self.vis_models_arr) ):
+                p1 = self.vis_models_arr[i].split("/")[0]
+                p2 = self.vis_models_arr[i].split("/")[1]
+                vevo_semantic_root = os.path.join(dataset_root, "vevo_semantic" , "all" , p1, p2)
+                self.vevo_semantic_root_list.append( vevo_semantic_root )
+
+        with open( self.vevo_meta_split_path ) as f:
+            for line in f:
+                self.id_list.append(line.strip())
+
+        self.data_files_chord = []
+        self.data_files_emotion = []
+        self.data_files_motion = []
+        self.data_files_scene_offset = []
+        self.data_files_semantic_list = []
+
+        self.data_files_loudness = []
+        self.data_files_note_density = []
+
+        for i in range(len(self.vis_models_arr)):
+            self.data_files_semantic_list.append([])
+
+        for fid in self.id_list:
+            fpath_chord = os.path.join( self.vevo_chord_root, fid + ".lab" )
+            fpath_emotion = os.path.join( self.vevo_emotion_root, fid + ".lab" )
+            fpath_motion = os.path.join( self.vevo_motion_root, fid + ".lab" )
+            fpath_scene_offset = os.path.join( self.vevo_scene_offset_root, fid + ".lab" )
+
+            fpath_loudness = os.path.join( self.vevo_loudness_root, fid + ".lab" )
+            fpath_note_density = os.path.join( self.vevo_note_density_root, fid + ".lab" )
+
+            fpath_semantic_list = []
+            for vevo_semantic_root in self.vevo_semantic_root_list:
+                fpath_semantic = os.path.join( vevo_semantic_root, fid + ".npy" )
+                fpath_semantic_list.append(fpath_semantic)
+
+            checkFile_semantic = True
+            for fpath_semantic in fpath_semantic_list:
+                if not os.path.exists(fpath_semantic):
+                    checkFile_semantic = False
+
+            checkFile_chord = os.path.exists(fpath_chord)
+            checkFile_emotion = os.path.exists(fpath_emotion)
+            checkFile_motion = os.path.exists(fpath_motion)
+            checkFile_scene_offset = os.path.exists(fpath_scene_offset)
+
+            checkFile_loudness = os.path.exists(fpath_loudness)
+            checkFile_note_density = os.path.exists(fpath_note_density)
+
+            if checkFile_chord and checkFile_emotion and checkFile_motion \
+                and checkFile_scene_offset and checkFile_semantic and checkFile_loudness and checkFile_note_density :
+
+                self.data_files_chord.append(fpath_chord)
+                self.data_files_emotion.append(fpath_emotion)
+                self.data_files_motion.append(fpath_motion)
+                self.data_files_scene_offset.append(fpath_scene_offset)
+
+                self.data_files_loudness.append(fpath_loudness)
+                self.data_files_note_density.append(fpath_note_density)
+
+                if self.is_video:
+                    for i in range(len(self.vis_models_arr)):
+                        self.data_files_semantic_list[i].append( fpath_semantic_list[i] )
+
+        chordDicPath = os.path.join( dataset_root, "vevo_meta/chord.json")
+
+        chordRootDicPath = os.path.join( dataset_root, "vevo_meta/chord_root.json")
+        chordAttrDicPath = os.path.join( dataset_root, "vevo_meta/chord_attr.json")
+
+        with open(chordDicPath) as json_file:
+            self.chordDic = json.load(json_file)
+
+        with open(chordRootDicPath) as json_file:
+            self.chordRootDic = json.load(json_file)
+
+        with open(chordAttrDicPath) as json_file:
+            self.chordAttrDic = json.load(json_file)
+
+    def __len__(self):
+        return len(self.data_files_chord)
+
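+    # Each item pairs teacher-forced chord token sequences (x / tgt, plus their
+    # root / attr decompositions) of length max_seq_chord - 1 with per-frame
+    # video-derived conditioning features (CLIP-based semantic features, scene
+    # offset, motion, emotion probabilities, note density, loudness), one row per
+    # time index in the .lab files, padded or truncated to max_seq_video frames;
+    # see the dict returned at the end of __getitem__.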
+    def __getitem__(self, idx):
+        #### ---- CHORD ----- ####
+        feature_chord = np.empty(self.max_seq_chord)
+        feature_chord.fill(CHORD_PAD)
+
+        feature_chordRoot = np.empty(self.max_seq_chord)
+        feature_chordRoot.fill(CHORD_ROOT_PAD)
+        feature_chordAttr = np.empty(self.max_seq_chord)
+        feature_chordAttr.fill(CHORD_ATTR_PAD)
+
+        key = ""
+        with open(self.data_files_chord[idx], encoding = 'utf-8') as f:
+            for line in f:
+                line = line.strip()
+                line_arr = line.split(" ")
+                if line_arr[0] == "key":
+                    key = line_arr[1] + " "+ line_arr[2]
+                    continue
+                time = line_arr[0]
+                time = int(time)
+                if time >= self.max_seq_chord:
+                    break
+                chord = line_arr[1]
+                chordID = self.chordDic[chord]
+                feature_chord[time] = chordID
+                chord_arr = chord.split(":")
+
+                if len(chord_arr) == 1:
+                    if chord_arr[0] == "N":
+                        chordRootID = self.chordRootDic["N"]
+                        chordAttrID = self.chordAttrDic["N"]
+                        feature_chordRoot[time] = chordRootID
+                        feature_chordAttr[time] = chordAttrID
+                    else:
+                        chordRootID = self.chordRootDic[chord_arr[0]]
+                        feature_chordRoot[time] = chordRootID
+                        feature_chordAttr[time] = 1
+                elif len(chord_arr) == 2:
+                    chordRootID = self.chordRootDic[chord_arr[0]]
+                    chordAttrID = self.chordAttrDic[chord_arr[1]]
+                    feature_chordRoot[time] = chordRootID
+                    feature_chordAttr[time] = chordAttrID
+
+        if "major" in key:
+            feature_key = torch.tensor([0])
+        else:
+            feature_key = torch.tensor([1])
+
+        feature_chord = torch.from_numpy(feature_chord)
+        feature_chord = feature_chord.to(torch.long)
+
+        feature_chordRoot = torch.from_numpy(feature_chordRoot)
+        feature_chordRoot = feature_chordRoot.to(torch.long)
+
+        feature_chordAttr = torch.from_numpy(feature_chordAttr)
+        feature_chordAttr = feature_chordAttr.to(torch.long)
+
+        feature_key = feature_key.float()
+
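+        # Teacher-forcing pairs: the input sequence is the chord tokens shifted by
+        # one step relative to the target, i.e. x = tokens[0:max_seq_chord-1] and
+        # tgt = tokens[1:max_seq_chord]; the same shift is applied to the root and
+        # attr decompositions. For example, with tokens [C, G, A:min, F, PAD, ...]
+        # the model sees C and is trained to predict G, sees G to predict A:min, etc.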
+        x = feature_chord[:self.max_seq_chord-1]
+        tgt = feature_chord[1:self.max_seq_chord]
+
+        x_root = feature_chordRoot[:self.max_seq_chord-1]
+        tgt_root = feature_chordRoot[1:self.max_seq_chord]
+        x_attr = feature_chordAttr[:self.max_seq_chord-1]
+        tgt_attr = feature_chordAttr[1:self.max_seq_chord]
+
+        if time < self.max_seq_chord:
+            tgt[time] = CHORD_END
+            tgt_root[time] = CHORD_ROOT_END
+            tgt_attr[time] = CHORD_ATTR_END
+
+        #### ---- SCENE OFFSET ----- ####
+        feature_scene_offset = np.empty(self.max_seq_video)
+        feature_scene_offset.fill(SCENE_OFFSET_PAD)
+        with open(self.data_files_scene_offset[idx], encoding = 'utf-8') as f:
+            for line in f:
+                line = line.strip()
+                line_arr = line.split(" ")
+                time = line_arr[0]
+                time = int(time)
+                if time >= self.max_seq_chord:
+                    break
+                sceneID = line_arr[1]
+                feature_scene_offset[time] = int(sceneID)+1
+
+        feature_scene_offset = torch.from_numpy(feature_scene_offset)
+        feature_scene_offset = feature_scene_offset.to(torch.float32)
+
+        #### ---- MOTION ----- ####
+        feature_motion = np.empty(self.max_seq_video)
+        feature_motion.fill(MOTION_PAD)
+        with open(self.data_files_motion[idx], encoding = 'utf-8') as f:
+            for line in f:
+                line = line.strip()
+                line_arr = line.split(" ")
+                time = line_arr[0]
+                time = int(time)
+                if time >= self.max_seq_chord:
+                    break
+                motion = line_arr[1]
+                feature_motion[time] = float(motion)
+
+        feature_motion = torch.from_numpy(feature_motion)
+        feature_motion = feature_motion.to(torch.float32)
+
+        #### ---- NOTE_DENSITY ----- ####
+        feature_note_density = np.empty(self.max_seq_video)
+        feature_note_density.fill(NOTE_DENSITY_PAD)
+        with open(self.data_files_note_density[idx], encoding = 'utf-8') as f:
+            for line in f:
+                line = line.strip()
+                line_arr = line.split(" ")
+                time = line_arr[0]
+                time = int(time)
+                if time >= self.max_seq_chord:
+                    break
+                note_density = line_arr[1]
+                feature_note_density[time] = float(note_density)
+
+        feature_note_density = torch.from_numpy(feature_note_density)
+        feature_note_density = feature_note_density.to(torch.float32)
+
+        #### ---- LOUDNESS ----- ####
+        feature_loudness = np.empty(self.max_seq_video)
+        feature_loudness.fill(LOUDNESS_PAD)
+        with open(self.data_files_loudness[idx], encoding = 'utf-8') as f:
+            for line in f:
+                line = line.strip()
+                line_arr = line.split(" ")
+                time = line_arr[0]
+                time = int(time)
+                if time >= self.max_seq_chord:
+                    break
+                loudness = line_arr[1]
+                feature_loudness[time] = float(loudness)
+
+        feature_loudness = torch.from_numpy(feature_loudness)
+        feature_loudness = feature_loudness.to(torch.float32)
+
+        #### ---- EMOTION ----- ####
+        if self.emo_model.startswith("6c"):
+            feature_emotion = np.empty( (self.max_seq_video, 6))
+        else:
+            feature_emotion = np.empty( (self.max_seq_video, 5))
+
+        feature_emotion.fill(EMOTION_PAD)
+        with open(self.data_files_emotion[idx], encoding = 'utf-8') as f:
+            for line in f:
+                line = line.strip()
+                line_arr = line.split(" ")
+                if line_arr[0] == "time":
+                    continue
+                time = line_arr[0]
+                time = int(time)
+                if time >= self.max_seq_chord:
+                    break
+
+                if len(line_arr) == 7:
+                    emo1, emo2, emo3, emo4, emo5, emo6 = \
+                        line_arr[1],line_arr[2],line_arr[3],line_arr[4],line_arr[5],line_arr[6]
+                    emoList = [ float(emo1), float(emo2), float(emo3), float(emo4), float(emo5), float(emo6) ]
+                elif len(line_arr) == 6:
+                    emo1, emo2, emo3, emo4, emo5 = \
+                        line_arr[1],line_arr[2],line_arr[3],line_arr[4],line_arr[5]
+                    emoList = [ float(emo1), float(emo2), float(emo3), float(emo4), float(emo5) ]
+
+                emoList = np.array(emoList)
+                feature_emotion[time] = emoList
+
+        feature_emotion = torch.from_numpy(feature_emotion)
+        feature_emotion = feature_emotion.to(torch.float32)
+
+        feature_emotion_argmax = torch.argmax(feature_emotion, dim=1)
+        _, max_prob_indices = torch.max(feature_emotion, dim=1)
+        max_prob_values = torch.gather(feature_emotion, dim=1, index=max_prob_indices.unsqueeze(1))
+        max_prob_values = max_prob_values.squeeze()
+
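+        # The per-frame emotion argmax is now mapped onto a 159-dimensional chord
+        # indicator vector (1 "N" slot + 12 roots x 13 qualities + END + PAD slots,
+        # matching the 159-entry chord vocabulary used for feature_chord). Each
+        # emotion class activates the chord qualities considered compatible with it,
+        # tiled identically across all 12 roots; e.g. "exciting" allows maj, sus4
+        # and 7 chords for every root.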
+        # -- emotion to chord
+        #                maj dim sus4 min7 min sus2 aug dim7 maj6 hdim7 7 min6 maj7
+        # 0. exciting : [1,0,1,0,0,0,0,0,0,0,1,0,0]
+        # 1. fearful  : [0,1,0,1,0,0,0,1,0,1,0,0,0]
+        # 2. tense    : [0,1,1,1,0,0,0,0,0,0,1,0,0]
+        # 3. sad      : [0,0,0,1,1,1,0,0,0,0,0,0,0]
+        # 4. relaxing : [1,0,0,0,0,0,0,0,1,0,0,0,1]
+        # 5. neutral  : [0,0,0,0,0,0,0,0,0,0,0,0,0]
+
+        a0 = [0]+[1,0,1,0,0,0,0,0,0,0,1,0,0]*12+[0,0]
+        a1 = [0]+[0,1,0,1,0,0,0,1,0,1,0,0,0]*12+[0,0]
+        a2 = [0]+[0,1,1,1,0,0,0,0,0,0,1,0,0]*12+[0,0]
+        a3 = [0]+[0,0,0,1,1,1,0,0,0,0,0,0,0]*12+[0,0]
+        a4 = [0]+[1,0,0,0,0,0,0,0,1,0,0,0,1]*12+[0,0]
+        a5 = [0]+[0,0,0,0,0,0,0,0,0,0,0,0,0]*12+[0,0]
+
+        aend = [0]+[0,0,0,0,0,0,0,0,0,0,0,0,0]*12+[1,0]
+        apad = [0]+[0,0,0,0,0,0,0,0,0,0,0,0,0]*12+[0,1]
+
+        # b0 = [0]+[1,0,1,0,0,0,0,0,0,0,1,0,0]+[0,0]
+        # b1 = [0]+[0,1,0,1,0,0,0,1,0,1,0,0,0]+[0,0]
+        # a2 = [0]+[0,1,1,1,0,0,0,0,0,0,1,0,0]+[0,0]
+        # a3 = [0]+[0,0,0,1,1,1,0,0,0,0,0,0,0]+[0,0]
+        # a4 = [0]+[1,0,0,0,0,0,0,0,1,0,0,0,1]+[0,0]
+        # a5 = [0]+[0,0,0,0,0,0,0,0,0,0,0,0,0]+[0,0]
+        # aend = [0]+[0,0,0,0,0,0,0,0,0,0,0,0,0]+[1,0]
+        # apad = [0]+[0,0,0,0,0,0,0,0,0,0,0,0,0]+[0,1]
+
+        a0_tensor = torch.tensor(a0)
+        a1_tensor = torch.tensor(a1)
+        a2_tensor = torch.tensor(a2)
+        a3_tensor = torch.tensor(a3)
+        a4_tensor = torch.tensor(a4)
+        a5_tensor = torch.tensor(a5)
+
+        aend_tensor = torch.tensor(aend)
+        apad_tensor = torch.tensor(apad)
+
+        mapped_tensor = torch.zeros((300, 159))  # (max_seq_chord, chord vocab size); assumes max_seq_chord == 300
+        for i, val in enumerate(feature_emotion_argmax):
+            if feature_chord[i] == CHORD_PAD:
+                mapped_tensor[i] = apad_tensor
+            elif feature_chord[i] == CHORD_END:
+                mapped_tensor[i] = aend_tensor
+            elif val == 0:
+                mapped_tensor[i] = a0_tensor
+            elif val == 1:
+                mapped_tensor[i] = a1_tensor
+            elif val == 2:
+                mapped_tensor[i] = a2_tensor
+            elif val == 3:
+                mapped_tensor[i] = a3_tensor
+            elif val == 4:
+                mapped_tensor[i] = a4_tensor
+            elif val == 5:
+                mapped_tensor[i] = a5_tensor
+
+        # feature emotion : [1, 300, 6]
+        # y               : [299, 159]
+        # tgt             : [299]
+        # tgt_emo         : [299, 159]
+        # tgt_emo_prob    : [299]
+
+        tgt_emotion = mapped_tensor[1:]
+        tgt_emotion_prob = max_prob_values[1:]
+
+        #### ---- SEMANTIC ----- ####
+        feature_semantic_list = []
+        if self.is_video:
+            for i in range( len(self.vis_models_arr) ):
+                video_feature = np.load(self.data_files_semantic_list[i][idx])
+                dim_vf = video_feature.shape[1] # 2048
+                video_feature_tensor = torch.from_numpy( video_feature )
+
+                feature_semantic = torch.full((self.max_seq_video, dim_vf,), SEMANTIC_PAD , dtype=torch.float32, device=cpu_device())
+                if(video_feature_tensor.shape[0] < self.max_seq_video):
+                    feature_semantic[:video_feature_tensor.shape[0]] = video_feature_tensor
+                else:
+                    feature_semantic = video_feature_tensor[:self.max_seq_video]
+                feature_semantic_list.append(feature_semantic)
+
+        #return x, tgt, feature_semantic_list, feature_key, feature_scene_offset
+        return { "x": x,
+                 "tgt": tgt,
+                 "x_root": x_root,
+                 "tgt_root": tgt_root,
+                 "x_attr": x_attr,
+                 "tgt_attr": tgt_attr,
+                 "semanticList": feature_semantic_list,
+                 "key": feature_key,
+                 "scene_offset": feature_scene_offset,
+                 "motion": feature_motion,
+                 "emotion": feature_emotion,
+                 "tgt_emotion": tgt_emotion,
+                 "tgt_emotion_prob": tgt_emotion_prob,
+                 "note_density": feature_note_density,
+                 "loudness": feature_loudness
+                }
+
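+# Usage sketch (illustrative only): it assumes the feature directories and the
+# split files under vevo_meta/split/<split_ver>/ named above already exist, and it
+# uses create_vevo_datasets defined below.
+#
+#   from torch.utils.data import DataLoader
+#
+#   train_ds, val_ds, test_ds = create_vevo_datasets(
+#       dataset_root="./dataset", split_ver="v1",
+#       vis_models="2d/clip_l14p", emo_model="6c_l14p",
+#       max_seq_chord=300, max_seq_video=300, is_video=True)
+#
+#   loader = DataLoader(train_ds, batch_size=1, shuffle=True)
+#   batch = next(iter(loader))
+#   # batch["x"] / batch["tgt"]: (1, 299) long chord tokens;
+#   # batch["semanticList"][0]: (1, 300, feature_dim) float visual features;
+#   # the remaining keys are per-frame conditioning signals.
+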
+def create_vevo_datasets(dataset_root = "./dataset", max_seq_chord=300, max_seq_video=300, vis_models="2d/clip_l14p", emo_model="6c_l14p", split_ver="v1", random_seq=True, is_video=True):
+
+    #dataset_root = "./dataset/", split="train", vis_models="", max_seq_chord=300, max_seq_video=300, random_seq=True, is_video = True):
+    train_dataset = VevoDataset(
+        dataset_root = dataset_root, split="train", split_ver=split_ver,
+        vis_models=vis_models, emo_model=emo_model, max_seq_chord=max_seq_chord, max_seq_video=max_seq_video,
+        random_seq=random_seq, is_video = is_video )
+
+    val_dataset = VevoDataset(
+        dataset_root = dataset_root, split="val", split_ver=split_ver,
+        vis_models=vis_models, emo_model=emo_model, max_seq_chord=max_seq_chord, max_seq_video=max_seq_video,
+        random_seq=random_seq, is_video = is_video )
+
+    test_dataset = VevoDataset(
+        dataset_root = dataset_root, split="test", split_ver=split_ver,
+        vis_models=vis_models, emo_model=emo_model, max_seq_chord=max_seq_chord, max_seq_video=max_seq_video,
+        random_seq=random_seq, is_video = is_video )
+
+    return train_dataset, val_dataset, test_dataset
+
+
+# V19
+def compute_vevo_accuracy(out, tgt):
+    softmax = nn.Softmax(dim=-1)
+    out = torch.argmax(softmax(out), dim=-1)
+
+    out = out.flatten()
+    tgt = tgt.flatten()
+
+    mask = (tgt != CHORD_PAD)
+
+    out = out[mask]
+    tgt = tgt[mask]
+
+    # Empty
+    if(len(tgt) == 0):
+        return 1.0
+
+    num_right = (out == tgt)
+    num_right = torch.sum(num_right).type(TORCH_FLOAT)
+
+    acc = num_right / len(tgt)
+
+    return acc
+
+
+def compute_hits_k(out, tgt, k):
+    softmax = nn.Softmax(dim=-1)
+    out = softmax(out)
+    _, topk_indices = torch.topk(out, k, dim=-1)  # Get the indices of top-k values
+
+    tgt = tgt.flatten()
+
+    topk_indices = torch.squeeze(topk_indices, dim = 0)
+
+    num_right = 0
+    pt = 0
+    for i, tlist in enumerate(topk_indices):
+        if tgt[i] == CHORD_PAD:
+            num_right += 0
+        else:
+            pt += 1
+            if tgt[i].item() in tlist:
+                num_right += 1
+
+    # Empty
+    if len(tgt) == 0:
+        return 1.0
+
+    num_right = torch.tensor(num_right, dtype=torch.float32)
+    hitk = num_right / pt
+
+    return hitk
+
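+# Note on the factorized metric below: the separately predicted chord-root and
+# chord-attr probabilities are combined into the full 159-way chord distribution
+# by multiplying the two softmaxed outputs. The chord index layout assumed here
+# matches vevo_meta/chord.json: index 0 is "N", indices 1..156 enumerate the
+# 12 roots x 13 qualities in root-major order (root = (i-1)//13 + 1,
+# quality = (i-1)%13 + 1), 157 is the END token and 158 is PAD.
+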
+def compute_hits_k_root_attr(out_root, out_attr, tgt, k):
+    softmax = nn.Softmax(dim=-1)
+    out_root = softmax(out_root)
+    out_attr = softmax(out_attr)
+
+    tensor_shape = torch.Size([1, 299, 159])
+    out = torch.zeros(tensor_shape)
+    for i in range(out.shape[-1]):
+        if i == 0 :
+            out[0, :, i] = out_root[0, :, 0] * out_attr[0, :, 0]
+        elif i == 157:
+            out[0, :, i] = out_root[0, :, 13] * out_attr[0, :, 14]
+        elif i == 158:
+            out[0, :, i] = out_root[0, :, 14] * out_attr[0, :, 15]
+        else:
+            rootindex = int( (i-1)/13 ) + 1
+            attrindex = (i-1)%13 + 1
+            out[0, :, i] = out_root[0, :, rootindex] * out_attr[0, :, attrindex]
+
+    out = softmax(out)
+    _, topk_indices = torch.topk(out, k, dim=-1)  # Get the indices of top-k values
+
+    tgt = tgt.flatten()
+
+    topk_indices = torch.squeeze(topk_indices, dim = 0)
+
+    num_right = 0
+    pt = 0
+    for i, tlist in enumerate(topk_indices):
+        if tgt[i] == CHORD_PAD:
+            num_right += 0
+        else:
+            pt += 1
+            if tgt[i].item() in tlist:
+                num_right += 1
+
+    # Empty
+    if len(tgt) == 0:
+        return 1.0
+
+    num_right = torch.tensor(num_right, dtype=torch.float32)
+    hitk = num_right / pt
+
+    return hitk
+
+
+def compute_vevo_correspondence(out, tgt, tgt_emotion, tgt_emotion_prob, emotion_threshold):
+
+    tgt_emotion = tgt_emotion.squeeze()
+    tgt_emotion_prob = tgt_emotion_prob.squeeze()
+
+    dataset_root = "./dataset/"
+    chordRootInvDicPath = os.path.join( dataset_root, "vevo_meta/chord_root_inv.json")
+    chordAttrInvDicPath = os.path.join( dataset_root, "vevo_meta/chord_attr_inv.json")
+    chordAttrDicPath = os.path.join( dataset_root, "vevo_meta/chord_attr.json")
+
+    chordDicPath = os.path.join( dataset_root, "vevo_meta/chord.json")
+    chordInvDicPath = os.path.join( dataset_root, "vevo_meta/chord_inv.json")
+
+    with open(chordRootInvDicPath) as json_file:
+        chordRootInvDic = json.load(json_file)
+    with open(chordAttrDicPath) as json_file:
+        chordAttrDic = json.load(json_file)
+    with open(chordAttrInvDicPath) as json_file:
+        chordAttrInvDic = json.load(json_file)
+    with open(chordDicPath) as json_file:
+        chordDic = json.load(json_file)
+    with open(chordInvDicPath) as json_file:
+        chordInvDic = json.load(json_file)
+
+    softmax = nn.Softmax(dim=-1)
+    out = torch.argmax(softmax(out), dim=-1)
+    out = out.flatten()
+
+    tgt = tgt.flatten()
+
+    #out = tgt
+
+    num_right = 0
+    tgt_emotion_quality = tgt_emotion[:, 0:14]
+    pt = 0
+    for i, out_element in enumerate( out ):
+
+        all_zeros = torch.all(tgt_emotion_quality[i] == 0)
+        if tgt_emotion[i][-1] == 1 or all_zeros or tgt_emotion_prob[i] < emotion_threshold:
+            num_right += 0
+        else:
+            pt += 1
+            if out_element.item() != CHORD_END and out_element.item() != CHORD_PAD:
+                gen_chord = chordInvDic[ str( out_element.item() ) ]
+
+                chord_arr = gen_chord.split(":")
+                if len(chord_arr) == 1:
+                    out_quality = 1
+                elif len(chord_arr) == 2:
+                    chordAttrID = chordAttrDic[chord_arr[1]]
+                    out_quality = chordAttrID # 0:N, 1:maj ... 13:maj7
+
+                if tgt_emotion_quality[i][out_quality] == 1:
+                    num_right += 1
+
+    # tgt = tgt.flatten()
+    # mask = (tgt != CHORD_PAD)
+    # out = out[mask]
+    # tgt = tgt[mask]
+    # Empty
+
+    if(len(tgt_emotion) == 0):
+        return 1.0
+
+    if(pt == 0):
+        return -1
+
+    num_right = torch.tensor(num_right, dtype=torch.float32)
+
+    # num_right = (out == tgt)
+    # num_right = torch.sum(num_right).type(TORCH_FLOAT)
+
+    acc = num_right / pt
+
+    return acc
+
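+# The variant below computes the same emotion/chord-quality correspondence score,
+# but first reconstructs each predicted chord id from the separate root and attr
+# predictions (via the chord_root_inv / chord_attr_inv lookup tables) before
+# checking its quality against the emotion-compatible qualities in tgt_emotion.
+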
+def compute_vevo_correspondence_root_attr(y_root, y_attr, tgt, tgt_emotion, tgt_emotion_prob, emotion_threshold):
+
+    tgt_emotion = tgt_emotion.squeeze()
+    tgt_emotion_prob = tgt_emotion_prob.squeeze()
+
+    dataset_root = "./dataset/"
+    chordRootInvDicPath = os.path.join( dataset_root, "vevo_meta/chord_root_inv.json")
+    chordAttrInvDicPath = os.path.join( dataset_root, "vevo_meta/chord_attr_inv.json")
+    chordAttrDicPath = os.path.join( dataset_root, "vevo_meta/chord_attr.json")
+
+    chordDicPath = os.path.join( dataset_root, "vevo_meta/chord.json")
+    chordInvDicPath = os.path.join( dataset_root, "vevo_meta/chord_inv.json")
+
+    with open(chordRootInvDicPath) as json_file:
+        chordRootInvDic = json.load(json_file)
+    with open(chordAttrDicPath) as json_file:
+        chordAttrDic = json.load(json_file)
+    with open(chordAttrInvDicPath) as json_file:
+        chordAttrInvDic = json.load(json_file)
+    with open(chordDicPath) as json_file:
+        chordDic = json.load(json_file)
+    with open(chordInvDicPath) as json_file:
+        chordInvDic = json.load(json_file)
+
+    softmax = nn.Softmax(dim=-1)
+
+    y_root = torch.argmax(softmax(y_root), dim=-1)
+    y_attr = torch.argmax(softmax(y_attr), dim=-1)
+
+    y_root = y_root.flatten()
+    y_attr = y_attr.flatten()
+
+    tgt = tgt.flatten()
+    # mask = (tgt != CHORD_PAD)
+    # y = []
+    y = np.empty( len(tgt) )
+
+    y.fill(CHORD_PAD)
+
+    for i in range(len(tgt)):
+        if y_root[i].item() == CHORD_ROOT_PAD or y_attr[i].item() == CHORD_ATTR_PAD:
+            y[i] = CHORD_PAD
+        elif y_root[i].item() == CHORD_ROOT_END or y_attr[i].item() == CHORD_ATTR_END:
+            y[i] = CHORD_END
+        else:
+            chordRoot = chordRootInvDic[str(y_root[i].item())]
+            chordAttr = chordAttrInvDic[str(y_attr[i].item())]
+            if chordRoot == "N":
+                y[i] = 0
+            else:
+                if chordAttr == "N" or chordAttr == "maj":
+                    y[i] = chordDic[chordRoot]
+                else:
+                    chord = chordRoot + ":" + chordAttr
+                    y[i] = chordDic[chord]
+
+    y = torch.from_numpy(y)
+    y = y.to(torch.long)
+    y = y.to(get_device())
+    # y = y[mask]
+    # tgt = tgt[mask]
+    # y = torch.argmax(softmax(y), dim=-1)
+    y = y.flatten()
+
+    #tgt = tgt.flatten()
+    #out = tgt
+
+    num_right = 0
+    tgt_emotion_quality = tgt_emotion[:, 0:14]
+    pt = 0
+    for i, y_element in enumerate( y ):
+        all_zeros = torch.all(tgt_emotion_quality[i] == 0)
+        if tgt_emotion[i][-1] == 1 or all_zeros or tgt_emotion_prob[i] < emotion_threshold:
+            num_right += 0
+        else:
+            pt += 1
+            if y_element.item() != CHORD_END and y_element.item() != CHORD_PAD:
+                gen_chord = chordInvDic[ str( y_element.item() ) ]
+                chord_arr = gen_chord.split(":")
+                if len(chord_arr) == 1:
+                    y_quality = 1
+                elif len(chord_arr) == 2:
+                    chordAttrID = chordAttrDic[chord_arr[1]]
+                    y_quality = chordAttrID # 0:N, 1:maj ... 13:maj7
+
+                if tgt_emotion_quality[i][y_quality] == 1:
+                    num_right += 1
+
+    # tgt = tgt.flatten()
+    # mask = (tgt != CHORD_PAD)
+    # out = out[mask]
+    # tgt = tgt[mask]
+    # Empty
+
+    if(len(tgt_emotion) == 0):
+        return 1.0
+
+    if(pt == 0):
+        return -1
+
+    num_right = torch.tensor(num_right, dtype=torch.float32)
+
+    # num_right = (out == tgt)
+    # num_right = torch.sum(num_right).type(TORCH_FLOAT)
+
+    acc = num_right / pt
+
+    return acc
+
+
+# v19_2 : but not superior
+def compute_vevo_accuracy_root_attr(y_root, y_attr, tgt):
+
+    dataset_root = "./dataset/"
+    chordRootInvDicPath = os.path.join( dataset_root, "vevo_meta/chord_root_inv.json")
+    chordAttrInvDicPath = os.path.join( dataset_root, "vevo_meta/chord_attr_inv.json")
+    chordDicPath = os.path.join( dataset_root, "vevo_meta/chord.json")
+
+    with open(chordRootInvDicPath) as json_file:
+        chordRootInvDic = json.load(json_file)
+    with open(chordAttrInvDicPath) as json_file:
+        chordAttrInvDic = json.load(json_file)
+    with open(chordDicPath) as json_file:
+        chordDic = json.load(json_file)
+
+    softmax = nn.Softmax(dim=-1)
+
+    y_root = torch.argmax(softmax(y_root), dim=-1)
+    y_attr = torch.argmax(softmax(y_attr), dim=-1)
+
+    y_root = y_root.flatten()
+    y_attr = y_attr.flatten()
+
+    tgt = tgt.flatten()
+
+    mask = (tgt != CHORD_PAD)
+
+    # y = []
+    y = np.empty( len(tgt) )
+    y.fill(CHORD_PAD)
+
+    for i in range(len(tgt)):
+        if y_root[i].item() == CHORD_ROOT_PAD or y_attr[i].item() == CHORD_ATTR_PAD:
+            y[i] = CHORD_PAD
+        elif y_root[i].item() == CHORD_ROOT_END or y_attr[i].item() == CHORD_ATTR_END:
+            y[i] = CHORD_END
+        else:
+            chordRoot = chordRootInvDic[str(y_root[i].item())]
+            chordAttr = chordAttrInvDic[str(y_attr[i].item())]
+            if chordRoot == "N":
+                y[i] = 0
+            else:
+                if chordAttr == "N" or chordAttr == "maj":
+                    y[i] = chordDic[chordRoot]
+                else:
+                    chord = chordRoot + ":" + chordAttr
+                    y[i] = chordDic[chord]
+
+    y = torch.from_numpy(y)
+    y = y.to(torch.long)
+    y = y.to(get_device())
+
+    y = y[mask]
+    tgt = tgt[mask]
+
+    # Empty
+    if(len(tgt) == 0):
+        return 1.0
+
+    num_right = (y == tgt)
+    num_right = torch.sum(num_right).type(TORCH_FLOAT)
+
+    acc = num_right / len(tgt)
+
+    return acc
+
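+# Metrics usage sketch (dummy tensors, illustrative only; 159 is the chord
+# vocabulary size assumed throughout this file, and CHORD_PAD / TORCH_FLOAT come
+# from utilities.constants):
+#
+#   logits = torch.randn(1, 299, 159)          # one sequence of per-step chord logits
+#   targets = torch.randint(0, 157, (1, 299))  # ground-truth chord ids
+#   acc = compute_vevo_accuracy(logits, targets)
+#   hit3 = compute_hits_k(logits, targets, k=3)
+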
+# def compute_vevo_accuracy_root(y_root, tgt_root):
+#     softmax = nn.Softmax(dim=-1)
+#     y_root = torch.argmax(softmax(y_root), dim=-1)
+#     y_root = y_root.flatten()
+#     tgt_root = tgt_root.flatten()
+#     mask = (tgt_root != CHORD_ROOT_PAD)
+#     y_root = y_root[mask]
+#     tgt_root = tgt_root[mask]
+#     # Empty
+#     if(len(tgt_root) == 0):
+#         return 1.0
+#     num_right = (y_root == tgt_root)
+#     num_right = torch.sum(num_right).type(TORCH_FLOAT)
+#     acc = num_right / len(tgt_root)
+#     return acc
+
+# def compute_vevo_accuracy_attr(y_attr, tgt_attr):
+#     softmax = nn.Softmax(dim=-1)
+#     y_attr = torch.argmax(softmax(y_attr), dim=-1)
+#     y_attr = y_attr.flatten()
+#     tgt_attr = tgt_attr.flatten()
+#     mask = (tgt_attr != CHORD_ATTR_PAD)
+#     y_attr = y_attr[mask]
+#     tgt_attr = tgt_attr[mask]
+#     # Empty
+#     if(len(tgt_attr) == 0):
+#         return 1.0
+#     num_right = (y_attr == tgt_attr)
+#     num_right = torch.sum(num_right).type(TORCH_FLOAT)
+#     acc = num_right / len(tgt_attr)
+#     return acc
+
+# def compute_loudness_accuracy(out, feature_loudness):
+#     softmax = nn.Softmax(dim=-1)
+
+#     out = torch.argmax(softmax(out), dim=-1)
+
+#     out = out.flatten()
+#     feature_loudness = feature_loudness.flatten()
+
+#     mask = (tgt != CHORD_PAD)
+
+#     out = out[mask]
+#     tgt = tgt[mask]
+
+#     # Empty
+#     if(len(tgt) == 0):
+#         return 1.0
+
+#     num_right = (out == tgt)
+#     num_right = torch.sum(num_right).type(TORCH_FLOAT)
+
+#     acc = num_right / len(tgt)
+
+#     return acc
+
+# def compute_note_density_accuracy(out, feature_note_density):
+#     softmax = nn.Softmax(dim=-1)
+#     out = torch.argmax(softmax(out), dim=-1)
+
+#     out = out.flatten()
+#     feature_note_density = feature_note_density.flatten()
+
+#     mask = (tgt != CHORD_PAD)
+
+#     out = out[mask]
+#     tgt = tgt[mask]
+
+#     # Empty
+#     if(len(tgt) == 0):
+#         return 1.0
+
+#     num_right = (out == tgt)
+#     num_right = torch.sum(num_right).type(TORCH_FLOAT)
+
+#     acc = num_right / len(tgt)
+
+#     return acc
\ No newline at end of file