Skip to content

Commit

Permalink
Add source tag (#317)
Browse files Browse the repository at this point in the history
* add source tag for some mapper op
  • Loading branch information
chenhesen authored Jun 26, 2024
1 parent c749a28 commit 9b337fc
Show file tree
Hide file tree
Showing 14 changed files with 131 additions and 9 deletions.
11 changes: 11 additions & 0 deletions data_juicer/ops/mapper/audio_ffmpeg_wrapped_mapper.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from typing import Dict, List, Optional

from data_juicer.utils.availability_utils import AvailabilityChecking
from data_juicer.utils.constant import Fields
from data_juicer.utils.file_utils import transfer_filename
from data_juicer.utils.logger_utils import HiddenPrints

Expand Down Expand Up @@ -50,8 +51,12 @@ def __init__(
def process(self, sample):
# there is no audio in this sample
if self.audio_key not in sample or not sample[self.audio_key]:
sample[Fields.source_file] = []
return sample

if Fields.source_file not in sample or not sample[Fields.source_file]:
sample[Fields.source_file] = sample[self.audio_key]

if self.filter_name is None:
return sample

Expand All @@ -71,5 +76,11 @@ def process(self, sample):
overwrite_output=self.overwrite_output)
processed[audio_key] = output_key

# when the file is modified, its source file needs to be updated.
for i, value in enumerate(loaded_audio_keys):
if sample[Fields.source_file][i] != value:
if processed[value] != value:
sample[Fields.source_file][i] = value

sample[self.audio_key] = [processed[key] for key in loaded_audio_keys]
return sample
25 changes: 19 additions & 6 deletions data_juicer/ops/mapper/image_blur_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,28 +56,41 @@ def __init__(self,
def process(self, sample, context=False):
# there is no image in this sample
if self.image_key not in sample or not sample[self.image_key]:
sample[Fields.source_file] = []
return sample

if Fields.source_file not in sample or not sample[Fields.source_file]:
sample[Fields.source_file] = sample[self.image_key]

# load images
loaded_image_keys = sample[self.image_key]
sample, images = load_data_with_context(sample, context,
loaded_image_keys, load_image)
processed = {}
for image_key in loaded_image_keys:
if image_key in processed:
continue

for index, value in enumerate(loaded_image_keys):
if self.p < np.random.rand():
continue
processed[image_key] = image_key
else:
blured_image_key = transfer_filename(value, OP_NAME,
blured_image_key = transfer_filename(image_key, OP_NAME,
**self._init_parameters)
if not os.path.exists(
blured_image_key) or blured_image_key not in images:
blured_image = images[value].convert('RGB').filter(
blured_image = images[image_key].convert('RGB').filter(
self.blur)
images[blured_image_key] = blured_image
blured_image.save(blured_image_key)
if context:
sample[Fields.context][blured_image_key] = blured_image
loaded_image_keys[index] = blured_image_key
processed[image_key] = blured_image_key

# when the file is modified, its source file needs to be updated.
for i, value in enumerate(loaded_image_keys):
if sample[Fields.source_file][i] != value:
if processed[value] != value:
sample[Fields.source_file][i] = value

sample[self.image_key] = loaded_image_keys
sample[self.image_key] = [processed[key] for key in loaded_image_keys]
return sample
10 changes: 10 additions & 0 deletions data_juicer/ops/mapper/image_face_blur_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,8 +69,12 @@ def __init__(self,
def process(self, sample, context=False):
# there is no image in this sample
if self.image_key not in sample or not sample[self.image_key]:
sample[Fields.source_file] = []
return sample

if Fields.source_file not in sample or not sample[Fields.source_file]:
sample[Fields.source_file] = sample[self.image_key]

# load images
loaded_image_keys = sample[self.image_key]
sample, images = load_data_with_context(sample, context,
Expand Down Expand Up @@ -108,6 +112,12 @@ def process(self, sample, context=False):
else:
key_mapping[key] = key

# when the file is modified, its source file needs to be updated.
for i, value in enumerate(loaded_image_keys):
if sample[Fields.source_file][i] != value:
if key_mapping[value] != value:
sample[Fields.source_file][i] = value

sample[self.image_key] = [
key_mapping[key] for key in loaded_image_keys
]
Expand Down
11 changes: 11 additions & 0 deletions data_juicer/ops/mapper/video_face_blur_mapper.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import av

from data_juicer.utils.availability_utils import AvailabilityChecking
from data_juicer.utils.constant import Fields
from data_juicer.utils.file_utils import transfer_filename
from data_juicer.utils.mm_utils import (load_data_with_context, load_video,
pil_to_opencv, process_each_frame)
Expand Down Expand Up @@ -68,8 +69,12 @@ def __init__(self,
def process(self, sample, context=False):
# there is no video in this sample
if self.video_key not in sample or not sample[self.video_key]:
sample[Fields.source_file] = []
return sample

if Fields.source_file not in sample or not sample[Fields.source_file]:
sample[Fields.source_file] = sample[self.video_key]

loaded_video_keys = sample[self.video_key]
sample, videos = load_data_with_context(sample, context,
loaded_video_keys, load_video)
Expand All @@ -90,6 +95,12 @@ def process(self, sample, context=False):
if not context:
video.close()

# when the file is modified, its source file needs to be updated.
for i, value in enumerate(loaded_video_keys):
if sample[Fields.source_file][i] != value:
if processed_video_keys[value] != value:
sample[Fields.source_file][i] = value

sample[self.video_key] = [
processed_video_keys[key] for key in loaded_video_keys
]
Expand Down
11 changes: 11 additions & 0 deletions data_juicer/ops/mapper/video_ffmpeg_wrapped_mapper.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from typing import Dict, List, Optional

from data_juicer.utils.availability_utils import AvailabilityChecking
from data_juicer.utils.constant import Fields
from data_juicer.utils.file_utils import transfer_filename
from data_juicer.utils.logger_utils import HiddenPrints

Expand Down Expand Up @@ -50,8 +51,12 @@ def __init__(
def process(self, sample):
# there is no video in this sample
if self.video_key not in sample or not sample[self.video_key]:
sample[Fields.source_file] = []
return sample

if Fields.source_file not in sample or not sample[Fields.source_file]:
sample[Fields.source_file] = sample[self.video_key]

if self.filter_name is None:
return sample

Expand All @@ -71,5 +76,11 @@ def process(self, sample):
overwrite_output=self.overwrite_output)
processed[video_key] = output_key

# when the file is modified, its source file needs to be updated.
for i, value in enumerate(loaded_video_keys):
if sample[Fields.source_file][i] != value:
if processed[value] != value:
sample[Fields.source_file][i] = value

sample[self.video_key] = [processed[key] for key in loaded_video_keys]
return sample
11 changes: 11 additions & 0 deletions data_juicer/ops/mapper/video_remove_watermark_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from jsonargparse.typing import List, PositiveInt

from data_juicer.utils.availability_utils import AvailabilityChecking
from data_juicer.utils.constant import Fields
from data_juicer.utils.file_utils import transfer_filename
from data_juicer.utils.logger_utils import HiddenPrints
from data_juicer.utils.mm_utils import (extract_video_frames_uniformly,
Expand Down Expand Up @@ -202,8 +203,12 @@ def _clean_watermark(self, frame, watermark_mask):
def process(self, sample, context=False):
# there is no video in this sample
if self.video_key not in sample or not sample[self.video_key]:
sample[Fields.source_file] = []
return sample

if Fields.source_file not in sample or not sample[Fields.source_file]:
sample[Fields.source_file] = sample[self.video_key]

loaded_video_keys = sample[self.video_key]
sample, videos = load_data_with_context(sample, context,
loaded_video_keys, load_video)
Expand All @@ -230,5 +235,11 @@ def process_frame_func(frame):
for vid_key in videos:
videos[vid_key].close()

# when the file is modified, its source file needs to be updated.
for i, value in enumerate(sample[self.video_key]):
if sample[Fields.source_file][i] != value:
if loaded_video_keys[i] != value:
sample[Fields.source_file][i] = value

sample[self.video_key] = loaded_video_keys
return sample
11 changes: 11 additions & 0 deletions data_juicer/ops/mapper/video_resize_aspect_ratio_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from fractions import Fraction

from data_juicer.utils.availability_utils import AvailabilityChecking
from data_juicer.utils.constant import Fields
from data_juicer.utils.file_utils import transfer_filename
from data_juicer.utils.logger_utils import HiddenPrints
from data_juicer.utils.mm_utils import load_video
Expand Down Expand Up @@ -102,8 +103,12 @@ def __init__(
def process(self, sample):
# there is no video in this sample
if self.video_key not in sample or not sample[self.video_key]:
sample[Fields.source_file] = []
return sample

if Fields.source_file not in sample or not sample[Fields.source_file]:
sample[Fields.source_file] = sample[self.video_key]

loaded_video_keys = sample[self.video_key]
for index, video_key in enumerate(loaded_video_keys):

Expand Down Expand Up @@ -139,5 +144,11 @@ def process(self, sample):
stream.run()
loaded_video_keys[index] = resized_video_key

# when the file is modified, its source file needs to be updated.
for i, value in enumerate(sample[self.video_key]):
if sample[Fields.source_file][i] != value:
if loaded_video_keys[i] != value:
sample[Fields.source_file][i] = value

sample[self.video_key] = loaded_video_keys
return sample
11 changes: 11 additions & 0 deletions data_juicer/ops/mapper/video_resize_resolution_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from jsonargparse.typing import PositiveInt

from data_juicer.utils.availability_utils import AvailabilityChecking
from data_juicer.utils.constant import Fields
from data_juicer.utils.file_utils import transfer_filename
from data_juicer.utils.logger_utils import HiddenPrints
from data_juicer.utils.mm_utils import load_video
Expand Down Expand Up @@ -86,8 +87,12 @@ def __init__(self,
def process(self, sample, context=False):
# there is no video in this sample
if self.video_key not in sample or not sample[self.video_key]:
sample[Fields.source_file] = []
return sample

if Fields.source_file not in sample or not sample[Fields.source_file]:
sample[Fields.source_file] = sample[self.video_key]

loaded_video_keys = sample[self.video_key]

for index, video_key in enumerate(loaded_video_keys):
Expand Down Expand Up @@ -163,5 +168,11 @@ def process(self, sample, context=False):

loaded_video_keys[index] = resized_video_key

# when the file is modified, its source file needs to be updated.
for i, value in enumerate(sample[self.video_key]):
if sample[Fields.source_file][i] != value:
if loaded_video_keys[i] != value:
sample[Fields.source_file][i] = value

sample[self.video_key] = loaded_video_keys
return sample
8 changes: 8 additions & 0 deletions data_juicer/ops/mapper/video_split_by_duration_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import numpy as np

from data_juicer.utils.constant import Fields
from data_juicer.utils.file_utils import (add_suffix_to_filename,
transfer_filename)
from data_juicer.utils.mm_utils import (SpecialTokens, cut_video_by_seconds,
Expand Down Expand Up @@ -85,11 +86,16 @@ def _process_single_sample(self, sample):
# there is no video in this sample
if self.video_key not in sample or sample[
self.video_key] is None or len(sample[self.video_key]) == 0:
sample[Fields.source_file] = []
return []

if Fields.source_file not in sample or not sample[Fields.source_file]:
sample[Fields.source_file] = sample[self.video_key]

# the split results
split_sample = copy.deepcopy(sample)
split_sample[self.text_key] = ''
split_sample[Fields.source_file] = []

# load all video(s)
loaded_video_keys = sample[self.video_key]
Expand Down Expand Up @@ -119,6 +125,8 @@ def _process_single_sample(self, sample):
split_video_keys.extend(new_video_keys)
place_holders.append(SpecialTokens.video *
len(new_video_keys))
split_sample[Fields.source_file].extend(
[video_key] * len(new_video_keys))

# insert the generated text according to given mode
replacer_function = create_replacer(place_holders)
Expand Down
8 changes: 8 additions & 0 deletions data_juicer/ops/mapper/video_split_by_key_frame_mapper.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import copy
import re

from data_juicer.utils.constant import Fields
from data_juicer.utils.file_utils import (add_suffix_to_filename,
transfer_filename)
from data_juicer.utils.mm_utils import (SpecialTokens, cut_video_by_seconds,
Expand Down Expand Up @@ -68,11 +69,16 @@ def _process_single_sample(self, sample):
# there is no video in this sample
if self.video_key not in sample or sample[
self.video_key] is None or len(sample[self.video_key]) == 0:
sample[Fields.source_file] = []
return []

if Fields.source_file not in sample or not sample[Fields.source_file]:
sample[Fields.source_file] = sample[self.video_key]

# the split results
split_sample = copy.deepcopy(sample)
split_sample[self.text_key] = ''
split_sample[Fields.source_file] = []

# load all video(s)
loaded_video_keys = sample[self.video_key]
Expand Down Expand Up @@ -101,6 +107,8 @@ def _process_single_sample(self, sample):
split_video_keys.extend(new_video_keys)
place_holders.append(SpecialTokens.video *
len(new_video_keys))
split_sample[Fields.source_file].extend(
[video_key] * len(new_video_keys))

# insert the generated text according to given mode
replacer_function = create_replacer(place_holders)
Expand Down
8 changes: 8 additions & 0 deletions data_juicer/ops/mapper/video_split_by_scene_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from jsonargparse.typing import NonNegativeFloat, NonNegativeInt

from data_juicer.utils.availability_utils import AvailabilityChecking
from data_juicer.utils.constant import Fields
from data_juicer.utils.file_utils import (add_suffix_to_filename,
transfer_filename)
from data_juicer.utils.mm_utils import SpecialTokens
Expand Down Expand Up @@ -84,6 +85,7 @@ def __init__(self,
def process(self, sample, context=False):
# there is no video in this sample
if self.video_key not in sample or not sample[self.video_key]:
sample[Fields.source_file] = []
return sample

# load videos
Expand Down Expand Up @@ -137,6 +139,12 @@ def process(self, sample, context=False):
sample[self.text_key])
sample[self.text_key] = updated_text

# when the file is modified, its source file needs to be updated.
sample[Fields.source_file] = []
for value in loaded_video_keys:
sample[Fields.source_file].extend([value] *
len(output_video_keys[value]))

sample[self.video_key] = list(
chain.from_iterable(
[output_video_keys[key] for key in loaded_video_keys]))
Expand Down
3 changes: 3 additions & 0 deletions data_juicer/utils/constant.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@ class Fields(object):
video_frame_tags = DEFAULT_PREFIX + 'video_frame_tags__'
video_audio_tags = DEFAULT_PREFIX + 'video_audio_tags__'

# the name of the original file from which this sample was derived.
source_file = DEFAULT_PREFIX + 'source_file__'

# the name of diretory to store the produced multimodal data
multimodal_data_output_dir = DEFAULT_PREFIX + 'produced_data__'

Expand Down
5 changes: 4 additions & 1 deletion tests/ops/mapper/test_video_split_by_duration_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,10 @@ def _get_res_list(self, dataset, source_list):

# for keep_original_sample=True
if set(output_paths) <= set(origin_paths):
res_list.append(sample)
res_list.append({
'text': sample['text'],
'videos': sample['videos']
})
continue

source = source_list[idx]
Expand Down
Loading

0 comments on commit 9b337fc

Please sign in to comment.