Skip to content

Commit

Permalink
refactor: remove tone features
Browse files Browse the repository at this point in the history
since panphon already deals with them

also adds parentheses support in pf encoding
  • Loading branch information
roedoejet committed Dec 13, 2024
1 parent e4e8159 commit 4026f09
Show file tree
Hide file tree
Showing 3 changed files with 52 additions and 76 deletions.
4 changes: 4 additions & 0 deletions everyvoice/config/text_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,10 @@ class Punctuation(BaseModel):
['"', "'", "“", "”", "«", "»"],
description="Quotemark punctuation symbols used in your datasets. Replaces these symbols with <QUOTE> internally.",
)
parentheses: list[str] = Field(
["(", ")", "[", "]", "{", "}"],
description="Punctuation symbols indicating parentheses, brackets, or braces. Replaces these symbols with <PAREN> internally.",
)
big_breaks: list[str] = Field(
[".", ":", ";"],
description="Punctuation symbols indicating a 'big break' used in your datasets. Replaces these symbols with <BB> internally.",
Expand Down
123 changes: 47 additions & 76 deletions everyvoice/text/features.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,44 @@
from typing import List
from unicodedata import normalize

import numpy as np
import numpy.typing as npt
from panphon import FeatureTable

from everyvoice.config.text_config import TextConfig

N_PHONOLOGICAL_FEATURES = 46
N_PHONOLOGICAL_FEATURES = 40


# TODO: support primary and secondary stress
class PhonologicalFeatureCalculator:
def __init__(self, text_config: TextConfig, punctuation_hash: dict):
self.config = text_config
self.punctuation_hash = punctuation_hash
self.feature_table = FeatureTable()

def normalize_features(
self, raw_phonological_features: npt.NDArray[np.float32]
) -> npt.NDArray[np.float32]:
"""Applies a (x + 1) / 2 scaling to all phonological features to transform from between -1 and 1 to between 0 and 1.
Args:
raw_phonological_features (npt.NDArray[np.float32]): phonological features valued between -1 and 1
Returns:
npt.NDArray[np.float32]: phonological features valued between 0 and 1.
"""
return (raw_phonological_features + 1) / 2

def denormalize_features(
self, normalized_phonological_features: npt.NDArray[np.float32]
) -> npt.NDArray[np.float32]:
"""Applies a (x * 2) - 1 scaling to all phonological features to transform from between 0 and 1 to between -1 and 1.
Args:
raw_phonological_features (npt.NDArray[np.float32]): phonological features valued between 0 and 1
Returns:
npt.NDArray[np.float32]: phonological features valued between -1 and 1.
"""
return (normalized_phonological_features * 2) - 1

def mask_token(self):
return self.get_features(["[MASK]"])[0]

Expand All @@ -32,54 +54,6 @@ def sep_token(self):
def unk_token(self):
return self.get_features(["[UNK]"])[0]

def get_tone_features(self, text: List[str]) -> npt.NDArray[np.float32]:
# TODO: sort out how to define encoding of tone features
"""Return Wang (1967) style tone features.
- Contour
- High
- Central
- Mid
- Rising
- Falling
- Convex
*If your language uses phonemic tone you MUST amend this function to match your language
Panphon does not use these features.*
Args:
text (list(str)): segmented phones
"""
tone_features = []
high_tone_chars = [
normalize("NFC", x)
for x in [
"áː",
"á",
"ʌ̃́ː",
"ʌ̃́",
"éː",
"é",
"íː",
"í",
"ṹː",
"ṹ",
"óː",
"ó",
]
]
low_tone_chars = [
normalize("NFC", x) for x in ["òː", "ũ̀ː", "ìː", "èː", "ʌ̃̀ː", "àː"]
]
for char in text:
char = normalize("NFC", char)
if char in high_tone_chars:
tone_features.append([-1, 1, -1, -1, -1, -1, -1])
elif char in low_tone_chars:
tone_features.append([-1, -1, -1, -1, -1, -1, -1])
else:
tone_features.append([0, 0, 0, 0, 0, 0, 0])
return np.array(tone_features, dtype=np.float32)

def get_punctuation_features(self, tokens: list[str]) -> npt.NDArray[np.float32]:
"""Get Punctuation features.
One-hot encodes the allowable types of punctuation and returns zeros elsewhere
Expand All @@ -90,36 +64,38 @@ def get_punctuation_features(self, tokens: list[str]) -> npt.NDArray[np.float32]
Returns:
npt.NDArray[np.float32]: a seven-dimensional one-hot encoding of punctuation, white space and silence
>>> punc_hash = {"exclamations": "<EXCL>", "question_symbols": "<QINT>", "quotemarks": "<QUOTE>", "big_breaks": "<BB>", "small_breaks": "<SB>", "ellipsis": "<EPS>"}
>>> punc_hash = {"exclamations": "<EXCL>", "question_symbols": "<QINT>", "quotemarks": "<QUOTE>", "big_breaks": "<BB>", "small_breaks": "<SB>", "ellipsis": "<EPS>", "parentheses": "<PAREN>"}
>>> pf = PhonologicalFeatureCalculator(TextConfig(), punc_hash)
>>> pf.get_punctuation_features(['h', 'ʌ', 'l', 'o', 'ʊ', '<EXCL>'])
array([[0., 0., 0., 0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0., 0., 1., 0.]], dtype=float32)
array([[0., 0., 0., 0., 0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0., 0., 0., 1., 0.]], dtype=float32)
"""
punctuation_features = []
for char in tokens:
if char == " ":
punctuation_features.append([1, 0, 0, 0, 0, 0, 0, 0])
punctuation_features.append([1, 0, 0, 0, 0, 0, 0, 0, 0])
elif char == self.punctuation_hash["question_symbols"]:
punctuation_features.append([0, 1, 0, 0, 0, 0, 0, 0])
punctuation_features.append([0, 1, 0, 0, 0, 0, 0, 0, 0])
elif char == self.punctuation_hash["big_breaks"]:
punctuation_features.append([0, 0, 1, 0, 0, 0, 0, 0])
punctuation_features.append([0, 0, 1, 0, 0, 0, 0, 0, 0])
elif char == self.punctuation_hash["small_breaks"]:
punctuation_features.append([0, 0, 0, 1, 0, 0, 0, 0])
punctuation_features.append([0, 0, 0, 1, 0, 0, 0, 0, 0])
elif char == self.punctuation_hash["quotemarks"]:
punctuation_features.append([0, 0, 0, 0, 1, 0, 0, 0])
punctuation_features.append([0, 0, 0, 0, 1, 0, 0, 0, 0])
elif char == self.punctuation_hash["parentheses"]:
punctuation_features.append([0, 0, 0, 0, 0, 1, 0, 0, 0])
elif char == self.punctuation_hash["ellipsis"]:
punctuation_features.append([0, 0, 0, 0, 0, 1, 0, 0])
punctuation_features.append([0, 0, 0, 0, 0, 0, 1, 0, 0])
elif char == self.punctuation_hash["exclamations"]:
punctuation_features.append([0, 0, 0, 0, 0, 0, 1, 0])
punctuation_features.append([0, 0, 0, 0, 0, 0, 0, 1, 0])
elif char in self.config.symbols.silence:
punctuation_features.append([0, 0, 0, 0, 0, 0, 0, 1])
punctuation_features.append([0, 0, 0, 0, 0, 0, 0, 0, 1])
else:
punctuation_features.append([0, 0, 0, 0, 0, 0, 0, 0])
punctuation_features.append([0, 0, 0, 0, 0, 0, 0, 0, 0])
return np.array(punctuation_features, dtype=np.float32)

def get_stress_features(self, tokens: list[str]) -> npt.NDArray[np.float32]:
Expand Down Expand Up @@ -160,7 +136,7 @@ def get_special_token_features(self, tokens: list[str]) -> npt.NDArray[np.float3
>>> punc_hash = {"exclamations": "<EXCL>", "question_symbols": "<QINT>", "quotemarks": "<QUOTE>", "big_breaks": "<BB>", "small_breaks": "<SB>", "ellipsis": "<EPS>"}
>>> pf = PhonologicalFeatureCalculator(TextConfig(), punc_hash)
>>> pf.get_special_token_features(['\x80', '[UNK]', '[CLS]', '[SEP]', '[MASK]' ])
>>> pf.get_special_token_features(['\x80', '[MASK]', '[CLS]', '[SEP]', '[UNK]' ])
array([[1., 0., 0., 0., 0.],
[0., 1., 0., 0., 0.],
[0., 0., 1., 0., 0.],
Expand Down Expand Up @@ -241,18 +217,13 @@ def get_features(self, tokens: list[str]) -> npt.NDArray[np.float32]:
punctuation_features = self.get_punctuation_features(tokens)
stress_features = self.get_stress_features(tokens)
special_token_features = self.get_special_token_features(tokens)
tone_features = self.get_tone_features(tokens)
seg_features = np.vstack([self.token_to_segmental_features(t) for t in tokens])
assert (
len(punctuation_features)
== len(tone_features)
== len(seg_features)
== len(stress_features)
len(punctuation_features) == len(seg_features) == len(stress_features)
), "There should be the same number of segments among segmental, tone, stress, punctuation features"
return np.concatenate(
[
seg_features,
tone_features,
stress_features,
punctuation_features,
special_token_features,
Expand Down
1 change: 1 addition & 0 deletions everyvoice/text/text_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ def __init__(self, config: TextConfig):
"big_breaks": "<BB>",
"small_breaks": "<SB>",
"ellipsis": "<EPS>",
"parentheses": "<PAREN>",
}
# Create a hash table from punctuation to the internal ID
self.punctuation_to_internal_id = {
Expand Down

0 comments on commit 4026f09

Please sign in to comment.