Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add zipformer recipe for audio tagging #1421

Merged
merged 33 commits into from
Apr 9, 2024
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
baa03c7
initial commit
marcoyang1998 Dec 19, 2023
a1aca34
add datamodule for audioset
marcoyang1998 Dec 19, 2023
bf58b63
minor fix
marcoyang1998 Dec 19, 2023
57ff00d
add softlink
marcoyang1998 Dec 19, 2023
bd01c21
add evaluation script
marcoyang1998 Dec 19, 2023
3e22108
update the manifest
marcoyang1998 Mar 20, 2024
1279355
Merge branch 'master' of github.com:marcoyang1998/icefall into audio_…
marcoyang1998 Mar 20, 2024
4e14800
add export.py
marcoyang1998 Mar 20, 2024
219d55d
support exporting the pretrained model
marcoyang1998 Mar 20, 2024
1921692
add file
marcoyang1998 Mar 20, 2024
9c4db1b
add inference script with a pretrained model
marcoyang1998 Mar 20, 2024
4bce81b
fix style
marcoyang1998 Mar 26, 2024
18479fc
Merge remote-tracking branch 'origin' into audio_tagging
marcoyang1998 Mar 26, 2024
7a8c9b7
fix style
marcoyang1998 Mar 26, 2024
f4c1872
enhance documentation
marcoyang1998 Mar 26, 2024
64dbcd0
minor changes
marcoyang1998 Mar 26, 2024
8b234b3
fix doc
marcoyang1998 Mar 26, 2024
a8ca029
fix the comments; wrap the classifier for jit script
marcoyang1998 Mar 29, 2024
2d1072f
add a file to test jit script model
marcoyang1998 Mar 29, 2024
6a7ac68
minor updates
marcoyang1998 Mar 29, 2024
5a4b712
update comments in evaluate.py
marcoyang1998 Mar 29, 2024
9e9bc75
minor updates
marcoyang1998 Mar 29, 2024
39e7de4
add readme and results
marcoyang1998 Mar 29, 2024
ff2975d
support export onnx model
marcoyang1998 Mar 29, 2024
7bd679f
add onnx pretrained
marcoyang1998 Mar 29, 2024
686d2d9
minor updates
marcoyang1998 Mar 29, 2024
f3e8e42
fix style
marcoyang1998 Apr 7, 2024
01b744f
support onnx export with batch size 1; also works for batch processin…
marcoyang1998 Apr 7, 2024
25d22d9
update the script to generate audioset manfiest
marcoyang1998 Apr 8, 2024
ff484be
add prepare.sh
marcoyang1998 Apr 8, 2024
1ca4646
add missing files
marcoyang1998 Apr 8, 2024
864914f
update comments
marcoyang1998 Apr 8, 2024
b134889
add link to audioset
marcoyang1998 Apr 9, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
159 changes: 159 additions & 0 deletions egs/audioset/AT/local/generate_audioset_manifest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
#!/usr/bin/env python3
# Copyright 2023 Xiaomi Corp. (authors: Xiaoyu Yang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
This file generates the manifest and computes the fbank features for one split
of the AudioSet dataset. The generated manifest and features are stored in
data/fbank by default (see --feat-output-dir).
"""

import argparse
marcoyang1998 marked this conversation as resolved.
Show resolved Hide resolved
import csv
import glob
import logging
import os

import torch
from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
from lhotse.audio import Recording
from lhotse.cut import MonoCut
from lhotse.supervision import SupervisionSegment

from icefall.utils import get_executor

# Restrict torch to one intra-op thread and one inter-op thread for this
# process. This runs at import time, i.e. before main() is invoked.
# NOTE(review): presumably this avoids CPU oversubscription when features are
# extracted by several parallel jobs (see num_jobs in main) -- confirm.
torch.set_num_threads(1)
torch.set_num_interop_threads(1)

marcoyang1998 marked this conversation as resolved.
Show resolved Hide resolved

def parse_csv(csv_file):
    """Read a tab-separated label file and map each audio file to its labels.

    The content of the csv file should be something like this
    ------------------------------------------------------
    filename    label
    dataset/AudioSet/balanced/xxxx.wav    0;451
    dataset/AudioSet/balanced/xxxy.wav    375
    ------------------------------------------------------

    Args:
      csv_file:
        Path to the tab-separated file. Its first row is treated as a header
        and is skipped.
    Returns:
      A dict whose keys are the last two "/"-separated components of the
      filename column (e.g. "balanced/xxxx.wav") and whose values are the
      unparsed label strings (e.g. "0;451"). If a key occurs more than once,
      the last occurrence wins.
    """
    mapping = {}
    # newline="" lets the csv module handle line endings itself, as its
    # documentation recommends.
    with open(csv_file, "r", newline="") as fin:
        reader = csv.reader(fin, delimiter="\t")
        next(reader, None)  # skip the header row (no-op on an empty file)
        for row in reader:
            if len(row) < 2:
                # Blank lines yield [] and malformed lines may lack the label
                # column; skip them instead of crashing with IndexError.
                continue
            key = "/".join(row[0].split("/")[-2:])
            mapping[key] = row[1]
    return mapping


def get_parser():
    """Build the command-line parser of this script.

    Returns:
      An argparse.ArgumentParser that accepts --dataset-dir, --split and
      --feat-output-dir; defaults are shown in the generated help text.
    """
    cli = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    # (flag, keyword arguments) pairs, registered in this order.
    options = (
        ("--dataset-dir", {"type": str, "default": "downloads/audioset"}),
        (
            "--split",
            {
                "type": str,
                "default": "balanced",
                "choices": ["balanced", "unbalanced", "eval", "eval_all"],
            },
        ),
        ("--feat-output-dir", {"type": str, "default": "data/fbank"}),
    )
    for flag, kwargs in options:
        cli.add_argument(flag, **kwargs)

    return cli


def main():
    """Create the manifest of one AudioSet split and compute its fbank features.

    Steps:
      1. Parse the command-line arguments (see get_parser).
      2. Load the labels of the split from the csv file inside --dataset-dir.
      3. Build one MonoCut with one SupervisionSegment per wav file that has
         a label entry; the label string is attached as `audio_event`.
         Files without labels are logged and skipped.
      4. Compute 80-dim fbank features and store them, together with the
         manifest `cuts_audioset_{split}.jsonl.gz`, in --feat-output-dir.

    Raises:
      ValueError: if the split is not one of the supported values.
    """
    parser = get_parser()
    args = parser.parse_args()

    dataset_dir = args.dataset_dir
    split = args.split
    feat_output_dir = args.feat_output_dir

    # os.cpu_count() may return None when the count cannot be determined.
    num_jobs = min(15, os.cpu_count() or 1)
    num_mel_bins = 80

    # Both the label file and the audio folder are resolved relative to
    # --dataset-dir and depend on the split, so that the audio, the labels and
    # the output names all refer to the same split.
    if split in ["balanced", "unbalanced"]:
        csv_file = f"{dataset_dir}/full_train_asedata_with_duration.csv"
        audio_subdir = split
    elif split == "eval":
        csv_file = f"{dataset_dir}/eval.csv"
        # NOTE(review): assumes the eval wavs live in {dataset_dir}/eval, by
        # analogy with the "balanced/xxxx.wav" keys documented in parse_csv.
        # Confirm against the actual download layout.
        audio_subdir = "eval"
    elif split == "eval_all":
        csv_file = f"{dataset_dir}/eval_all.csv"
        audio_subdir = "eval/wav_all"
    else:
        raise ValueError(f"Unsupported split: {split}")

    labels = parse_csv(csv_file)

    audio_dir = f"{dataset_dir}/{audio_subdir}"
    audio_files = glob.glob(f"{audio_dir}/*.wav")
    if not audio_files:
        logging.warning(f"No wav files found in {audio_dir}.")

    new_cuts = []
    for i, audio in enumerate(audio_files):
        # The id is "<parent_dir>/<basename>", which matches the keys
        # produced by parse_csv.
        cut_id = "/".join(audio.split("/")[-2:])

        # Look up the labels first so that unlabeled files are skipped
        # before their audio header is read.
        if cut_id not in labels:
            logging.info(f"No labels found for {cut_id}.")
            continue

        recording = Recording.from_file(audio, cut_id)
        cut = MonoCut(
            id=cut_id,
            start=0.0,
            duration=recording.duration,
            channel=0,
            recording=recording,
        )
        supervision = SupervisionSegment(
            id=cut_id,
            recording_id=cut.recording.id,
            start=0.0,
            channel=0,
            duration=cut.duration,
        )
        supervision.audio_event = labels[cut_id]
        cut.supervisions = [supervision]
        new_cuts.append(cut)

        if i % 100 == 0 and i:
            logging.info(f"Processed {i} cuts until now.")

    cuts = CutSet.from_cuts(new_cuts)

    extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins))

    logging.info(f"Computing fbank features for {split}")
    with get_executor() as ex:
        cuts = cuts.compute_and_store_features(
            extractor=extractor,
            storage_path=f"{feat_output_dir}/{split}_{split}_feats",
            # Use the local job count unless get_executor() returned an
            # executor, in which case 80 jobs are requested.
            num_jobs=num_jobs if ex is None else 80,
            executor=ex,
            storage_type=LilcomChunkyWriter,
        )

    manifest_path = f"{feat_output_dir}/cuts_audioset_{split}.jsonl.gz"

    logging.info(f"Storing the manifest to {manifest_path}")
    cuts.to_jsonl(manifest_path)


if __name__ == "__main__":
    # Log at INFO level with timestamp, level and source location, then run.
    logging.basicConfig(
        format="%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s",
        level=logging.INFO,
    )
    main()
Loading
Loading