Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merging whisper 1st revision code #169

Open
wants to merge 3 commits into
base: whisper-1st-rev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 9 additions & 9 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ endif
# {echo | python}
%-pickle: PRJCT_ID := tfs
# {tfs | podcast}
%-pickle: SID_LIST = 676
%-pickle: SID_LIST = 625
# {625 676 7170 798 | 661 662 717 723 741 742 743 763 798 | 777}

create-pickle:
Expand Down Expand Up @@ -94,9 +94,9 @@ download-247-pickles:
## settings for targets: generate-embeddings, concatenate-embeddings
%-embeddings: PRJCT_ID := tfs
# {tfs | podcast}
%-embeddings: SID := 625
%-embeddings: SID := 798
# {625 | 676 | 7170 | 798 | 661}
%-embeddings: CONV_IDS = $(shell seq 1 54)
%-embeddings: CONV_IDS = $(shell seq 1 15)
# {54 for 625 | 78 for 676 | 1 for 661 | 24 for 7170 | 15 for 798}
%-embeddings: PKL_IDENTIFIER := full
# {full | trimmed | binned}
Expand All @@ -116,8 +116,10 @@ download-247-pickles:


# Whisper specific args
%-embeddings: MDL_TYPE := en-only
# {full | full-onset | full_n-1 | en-only | de-only}
%-embeddings: MDL_TYPE := full
# {full | full-onset | full-n-1 | en-only | de-only}
%-embeddings: BIN_TYPE := fixed-bin
# {var-bin | fixed-bin}
%-embeddings: SHUFFLE_AUDIO := none
# {none | samples | phonemes | words | 2-words | flip}
%-embeddings: SHUFFLE_WORDS := none
Expand All @@ -136,15 +138,12 @@ download-247-pickles:
# %-embeddings: RCTXP := --rctxp
# %-embeddings: UTT := --utt



# Note: the embeddings file is the same for all podcast subjects, \
and hence only needs to be generated once, using subject 661
%-embeddings: JOB_NAME = $(subst /,-,$(EMB_TYPE))
%-embeddings: CMD = sbatch --job-name=$(SID)-$(JOB_NAME)-cnxt-$$cnxt_len submit.sh
# {echo | python | sbatch --job-name=$(SID)-$(JOB_NAME)-cnxt-$$cnxt_len submit.sh}


# generate-base-for-embeddings: Generates the base dataframe for embedding generation
generate-base-for-embeddings:
python scripts/tfsemb_LMBase.py \
Expand All @@ -165,6 +164,7 @@ generate-embeddings:
--conversation-id $$conv_id \
--embedding-type $(EMB_TYPE) \
--model-type $(MDL_TYPE) \
--bin-type $(BIN_TYPE) \
--shuffle-audio $(SHUFFLE_AUDIO) \
--shuffle-words $(SHUFFLE_WORDS) \
--cutoff $(CUTOFF) \
Expand Down Expand Up @@ -200,7 +200,7 @@ copy-embeddings:

# Download huggingface models to cache (before generating embeddings)
# This target needs to be run on the head node
cache-models: MODEL := openai/whisper-medium.en
cache-models: MODEL := openai/whisper-tiny.en
# {causal | seq2seq | mlm | or any model name specified in EMB_TYPE comments}
cache-models:
python -c "from scripts import tfsemb_download; tfsemb_download.download_tokenizers_and_models(\"$(MODEL)\")"
1 change: 1 addition & 0 deletions scripts/tfsemb_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ def setup_environ(args):
stra,
"layer_%02d",
)

args.output_file_name = args.conversation_list[args.conversation_id - 1]
args.output_file = os.path.join(args.output_dir, args.output_file_name)

Expand Down
Loading