Add source tag (#317)

* add source tag for some mapper op
modelscope · Jun 26, 2024 · 9b337fc · 9b337fc
1 parent c749a28
commit 9b337fc
Show file tree

Hide file tree

Showing 14 changed files with 131 additions and 9 deletions.
diff --git a/data_juicer/ops/mapper/audio_ffmpeg_wrapped_mapper.py b/data_juicer/ops/mapper/audio_ffmpeg_wrapped_mapper.py
@@ -1,6 +1,7 @@
 from typing import Dict, List, Optional
 
 from data_juicer.utils.availability_utils import AvailabilityChecking
+from data_juicer.utils.constant import Fields
 from data_juicer.utils.file_utils import transfer_filename
 from data_juicer.utils.logger_utils import HiddenPrints
 
@@ -50,8 +51,12 @@ def __init__(
     def process(self, sample):
         # there is no audio in this sample
         if self.audio_key not in sample or not sample[self.audio_key]:
+            sample[Fields.source_file] = []
             return sample
 
+        if Fields.source_file not in sample or not sample[Fields.source_file]:
+            sample[Fields.source_file] = sample[self.audio_key]
+
         if self.filter_name is None:
             return sample
 
@@ -71,5 +76,11 @@ def process(self, sample):
                        overwrite_output=self.overwrite_output)
             processed[audio_key] = output_key
 
+        # when the file is modified, its source file needs to be updated.
+        for i, value in enumerate(loaded_audio_keys):
+            if sample[Fields.source_file][i] != value:
+                if processed[value] != value:
+                    sample[Fields.source_file][i] = value
+
         sample[self.audio_key] = [processed[key] for key in loaded_audio_keys]
         return sample
diff --git a/data_juicer/ops/mapper/image_blur_mapper.py b/data_juicer/ops/mapper/image_blur_mapper.py
@@ -56,28 +56,41 @@ def __init__(self,
     def process(self, sample, context=False):
         # there is no image in this sample
         if self.image_key not in sample or not sample[self.image_key]:
+            sample[Fields.source_file] = []
             return sample
 
+        if Fields.source_file not in sample or not sample[Fields.source_file]:
+            sample[Fields.source_file] = sample[self.image_key]
+
         # load images
         loaded_image_keys = sample[self.image_key]
         sample, images = load_data_with_context(sample, context,
                                                 loaded_image_keys, load_image)
+        processed = {}
+        for image_key in loaded_image_keys:
+            if image_key in processed:
+                continue
 
-        for index, value in enumerate(loaded_image_keys):
             if self.p < np.random.rand():
-                continue
+                processed[image_key] = image_key
             else:
-                blured_image_key = transfer_filename(value, OP_NAME,
+                blured_image_key = transfer_filename(image_key, OP_NAME,
                                                      **self._init_parameters)
                 if not os.path.exists(
                         blured_image_key) or blured_image_key not in images:
-                    blured_image = images[value].convert('RGB').filter(
+                    blured_image = images[image_key].convert('RGB').filter(
                         self.blur)
                     images[blured_image_key] = blured_image
                     blured_image.save(blured_image_key)
                     if context:
                         sample[Fields.context][blured_image_key] = blured_image
-                loaded_image_keys[index] = blured_image_key
+                processed[image_key] = blured_image_key
+
+        # when the file is modified, its source file needs to be updated.
+        for i, value in enumerate(loaded_image_keys):
+            if sample[Fields.source_file][i] != value:
+                if processed[value] != value:
+                    sample[Fields.source_file][i] = value
 
-        sample[self.image_key] = loaded_image_keys
+        sample[self.image_key] = [processed[key] for key in loaded_image_keys]
         return sample
diff --git a/data_juicer/ops/mapper/image_face_blur_mapper.py b/data_juicer/ops/mapper/image_face_blur_mapper.py
@@ -69,8 +69,12 @@ def __init__(self,
     def process(self, sample, context=False):
         # there is no image in this sample
         if self.image_key not in sample or not sample[self.image_key]:
+            sample[Fields.source_file] = []
             return sample
 
+        if Fields.source_file not in sample or not sample[Fields.source_file]:
+            sample[Fields.source_file] = sample[self.image_key]
+
         # load images
         loaded_image_keys = sample[self.image_key]
         sample, images = load_data_with_context(sample, context,
@@ -108,6 +112,12 @@ def process(self, sample, context=False):
             else:
                 key_mapping[key] = key
 
+        # when the file is modified, its source file needs to be updated.
+        for i, value in enumerate(loaded_image_keys):
+            if sample[Fields.source_file][i] != value:
+                if key_mapping[value] != value:
+                    sample[Fields.source_file][i] = value
+
         sample[self.image_key] = [
             key_mapping[key] for key in loaded_image_keys
         ]

diff --git a/data_juicer/ops/mapper/video_face_blur_mapper.py b/data_juicer/ops/mapper/video_face_blur_mapper.py
@@ -1,6 +1,7 @@
 import av
 
 from data_juicer.utils.availability_utils import AvailabilityChecking
+from data_juicer.utils.constant import Fields
 from data_juicer.utils.file_utils import transfer_filename
 from data_juicer.utils.mm_utils import (load_data_with_context, load_video,
                                         pil_to_opencv, process_each_frame)
@@ -68,8 +69,12 @@ def __init__(self,
     def process(self, sample, context=False):
         # there is no video in this sample
         if self.video_key not in sample or not sample[self.video_key]:
+            sample[Fields.source_file] = []
             return sample
 
+        if Fields.source_file not in sample or not sample[Fields.source_file]:
+            sample[Fields.source_file] = sample[self.video_key]
+
         loaded_video_keys = sample[self.video_key]
         sample, videos = load_data_with_context(sample, context,
                                                 loaded_video_keys, load_video)
@@ -90,6 +95,12 @@ def process(self, sample, context=False):
             if not context:
                 video.close()
 
+        # when the file is modified, its source file needs to be updated.
+        for i, value in enumerate(loaded_video_keys):
+            if sample[Fields.source_file][i] != value:
+                if processed_video_keys[value] != value:
+                    sample[Fields.source_file][i] = value
+
         sample[self.video_key] = [
             processed_video_keys[key] for key in loaded_video_keys
         ]

diff --git a/data_juicer/ops/mapper/video_ffmpeg_wrapped_mapper.py b/data_juicer/ops/mapper/video_ffmpeg_wrapped_mapper.py
@@ -1,6 +1,7 @@
 from typing import Dict, List, Optional
 
 from data_juicer.utils.availability_utils import AvailabilityChecking
+from data_juicer.utils.constant import Fields
 from data_juicer.utils.file_utils import transfer_filename
 from data_juicer.utils.logger_utils import HiddenPrints
 
@@ -50,8 +51,12 @@ def __init__(
     def process(self, sample):
         # there is no video in this sample
         if self.video_key not in sample or not sample[self.video_key]:
+            sample[Fields.source_file] = []
             return sample
 
+        if Fields.source_file not in sample or not sample[Fields.source_file]:
+            sample[Fields.source_file] = sample[self.video_key]
+
         if self.filter_name is None:
             return sample
 
@@ -71,5 +76,11 @@ def process(self, sample):
                        overwrite_output=self.overwrite_output)
             processed[video_key] = output_key
 
+        # when the file is modified, its source file needs to be updated.
+        for i, value in enumerate(loaded_video_keys):
+            if sample[Fields.source_file][i] != value:
+                if processed[value] != value:
+                    sample[Fields.source_file][i] = value
+
         sample[self.video_key] = [processed[key] for key in loaded_video_keys]
         return sample
diff --git a/data_juicer/ops/mapper/video_remove_watermark_mapper.py b/data_juicer/ops/mapper/video_remove_watermark_mapper.py
@@ -5,6 +5,7 @@
 from jsonargparse.typing import List, PositiveInt
 
 from data_juicer.utils.availability_utils import AvailabilityChecking
+from data_juicer.utils.constant import Fields
 from data_juicer.utils.file_utils import transfer_filename
 from data_juicer.utils.logger_utils import HiddenPrints
 from data_juicer.utils.mm_utils import (extract_video_frames_uniformly,
@@ -202,8 +203,12 @@ def _clean_watermark(self, frame, watermark_mask):
     def process(self, sample, context=False):
         # there is no video in this sample
         if self.video_key not in sample or not sample[self.video_key]:
+            sample[Fields.source_file] = []
             return sample
 
+        if Fields.source_file not in sample or not sample[Fields.source_file]:
+            sample[Fields.source_file] = sample[self.video_key]
+
         loaded_video_keys = sample[self.video_key]
         sample, videos = load_data_with_context(sample, context,
                                                 loaded_video_keys, load_video)
@@ -230,5 +235,11 @@ def process_frame_func(frame):
             for vid_key in videos:
                 videos[vid_key].close()
 
+        # when the file is modified, its source file needs to be updated.
+        for i, value in enumerate(sample[self.video_key]):
+            if sample[Fields.source_file][i] != value:
+                if loaded_video_keys[i] != value:
+                    sample[Fields.source_file][i] = value
+
         sample[self.video_key] = loaded_video_keys
         return sample
diff --git a/data_juicer/ops/mapper/video_resize_aspect_ratio_mapper.py b/data_juicer/ops/mapper/video_resize_aspect_ratio_mapper.py
@@ -3,6 +3,7 @@
 from fractions import Fraction
 
 from data_juicer.utils.availability_utils import AvailabilityChecking
+from data_juicer.utils.constant import Fields
 from data_juicer.utils.file_utils import transfer_filename
 from data_juicer.utils.logger_utils import HiddenPrints
 from data_juicer.utils.mm_utils import load_video
@@ -102,8 +103,12 @@ def __init__(
     def process(self, sample):
         # there is no video in this sample
         if self.video_key not in sample or not sample[self.video_key]:
+            sample[Fields.source_file] = []
             return sample
 
+        if Fields.source_file not in sample or not sample[Fields.source_file]:
+            sample[Fields.source_file] = sample[self.video_key]
+
         loaded_video_keys = sample[self.video_key]
         for index, video_key in enumerate(loaded_video_keys):
 
@@ -139,5 +144,11 @@ def process(self, sample):
                 stream.run()
             loaded_video_keys[index] = resized_video_key
 
+        # when the file is modified, its source file needs to be updated.
+        for i, value in enumerate(sample[self.video_key]):
+            if sample[Fields.source_file][i] != value:
+                if loaded_video_keys[i] != value:
+                    sample[Fields.source_file][i] = value
+
         sample[self.video_key] = loaded_video_keys
         return sample
diff --git a/data_juicer/ops/mapper/video_resize_resolution_mapper.py b/data_juicer/ops/mapper/video_resize_resolution_mapper.py
@@ -5,6 +5,7 @@
 from jsonargparse.typing import PositiveInt
 
 from data_juicer.utils.availability_utils import AvailabilityChecking
+from data_juicer.utils.constant import Fields
 from data_juicer.utils.file_utils import transfer_filename
 from data_juicer.utils.logger_utils import HiddenPrints
 from data_juicer.utils.mm_utils import load_video
@@ -86,8 +87,12 @@ def __init__(self,
     def process(self, sample, context=False):
         # there is no video in this sample
         if self.video_key not in sample or not sample[self.video_key]:
+            sample[Fields.source_file] = []
             return sample
 
+        if Fields.source_file not in sample or not sample[Fields.source_file]:
+            sample[Fields.source_file] = sample[self.video_key]
+
         loaded_video_keys = sample[self.video_key]
 
         for index, video_key in enumerate(loaded_video_keys):
@@ -163,5 +168,11 @@ def process(self, sample, context=False):
 
             loaded_video_keys[index] = resized_video_key
 
+        # when the file is modified, its source file needs to be updated.
+        for i, value in enumerate(sample[self.video_key]):
+            if sample[Fields.source_file][i] != value:
+                if loaded_video_keys[i] != value:
+                    sample[Fields.source_file][i] = value
+
         sample[self.video_key] = loaded_video_keys
         return sample
diff --git a/data_juicer/ops/mapper/video_split_by_duration_mapper.py b/data_juicer/ops/mapper/video_split_by_duration_mapper.py
@@ -3,6 +3,7 @@
 
 import numpy as np
 
+from data_juicer.utils.constant import Fields
 from data_juicer.utils.file_utils import (add_suffix_to_filename,
                                           transfer_filename)
 from data_juicer.utils.mm_utils import (SpecialTokens, cut_video_by_seconds,
@@ -85,11 +86,16 @@ def _process_single_sample(self, sample):
         # there is no video in this sample
         if self.video_key not in sample or sample[
                 self.video_key] is None or len(sample[self.video_key]) == 0:
+            sample[Fields.source_file] = []
             return []
 
+        if Fields.source_file not in sample or not sample[Fields.source_file]:
+            sample[Fields.source_file] = sample[self.video_key]
+
         # the split results
         split_sample = copy.deepcopy(sample)
         split_sample[self.text_key] = ''
+        split_sample[Fields.source_file] = []
 
         # load all video(s)
         loaded_video_keys = sample[self.video_key]
@@ -119,6 +125,8 @@ def _process_single_sample(self, sample):
                     split_video_keys.extend(new_video_keys)
                     place_holders.append(SpecialTokens.video *
                                          len(new_video_keys))
+                    split_sample[Fields.source_file].extend(
+                        [video_key] * len(new_video_keys))
 
                 # insert the generated text according to given mode
                 replacer_function = create_replacer(place_holders)

diff --git a/data_juicer/ops/mapper/video_split_by_key_frame_mapper.py b/data_juicer/ops/mapper/video_split_by_key_frame_mapper.py
@@ -1,6 +1,7 @@
 import copy
 import re
 
+from data_juicer.utils.constant import Fields
 from data_juicer.utils.file_utils import (add_suffix_to_filename,
                                           transfer_filename)
 from data_juicer.utils.mm_utils import (SpecialTokens, cut_video_by_seconds,
@@ -68,11 +69,16 @@ def _process_single_sample(self, sample):
         # there is no video in this sample
         if self.video_key not in sample or sample[
                 self.video_key] is None or len(sample[self.video_key]) == 0:
+            sample[Fields.source_file] = []
             return []
 
+        if Fields.source_file not in sample or not sample[Fields.source_file]:
+            sample[Fields.source_file] = sample[self.video_key]
+
         # the split results
         split_sample = copy.deepcopy(sample)
         split_sample[self.text_key] = ''
+        split_sample[Fields.source_file] = []
 
         # load all video(s)
         loaded_video_keys = sample[self.video_key]
@@ -101,6 +107,8 @@ def _process_single_sample(self, sample):
                     split_video_keys.extend(new_video_keys)
                     place_holders.append(SpecialTokens.video *
                                          len(new_video_keys))
+                    split_sample[Fields.source_file].extend(
+                        [video_key] * len(new_video_keys))
 
                 # insert the generated text according to given mode
                 replacer_function = create_replacer(place_holders)

diff --git a/data_juicer/ops/mapper/video_split_by_scene_mapper.py b/data_juicer/ops/mapper/video_split_by_scene_mapper.py
@@ -5,6 +5,7 @@
 from jsonargparse.typing import NonNegativeFloat, NonNegativeInt
 
 from data_juicer.utils.availability_utils import AvailabilityChecking
+from data_juicer.utils.constant import Fields
 from data_juicer.utils.file_utils import (add_suffix_to_filename,
                                           transfer_filename)
 from data_juicer.utils.mm_utils import SpecialTokens
@@ -84,6 +85,7 @@ def __init__(self,
     def process(self, sample, context=False):
         # there is no video in this sample
         if self.video_key not in sample or not sample[self.video_key]:
+            sample[Fields.source_file] = []
             return sample
 
         # load videos
@@ -137,6 +139,12 @@ def process(self, sample, context=False):
                 sample[self.text_key])
             sample[self.text_key] = updated_text
 
+        # when the file is modified, its source file needs to be updated.
+        sample[Fields.source_file] = []
+        for value in loaded_video_keys:
+            sample[Fields.source_file].extend([value] *
+                                              len(output_video_keys[value]))
+
         sample[self.video_key] = list(
             chain.from_iterable(
                 [output_video_keys[key] for key in loaded_video_keys]))

diff --git a/data_juicer/utils/constant.py b/data_juicer/utils/constant.py
@@ -19,6 +19,9 @@ class Fields(object):
     video_frame_tags = DEFAULT_PREFIX + 'video_frame_tags__'
     video_audio_tags = DEFAULT_PREFIX + 'video_audio_tags__'
 
+    # the name of the original file from which this sample was derived.
+    source_file = DEFAULT_PREFIX + 'source_file__'
+
     # the name of diretory to store the produced multimodal data
     multimodal_data_output_dir = DEFAULT_PREFIX + 'produced_data__'
 

diff --git a/tests/ops/mapper/test_video_split_by_duration_mapper.py b/tests/ops/mapper/test_video_split_by_duration_mapper.py
@@ -28,7 +28,10 @@ def _get_res_list(self, dataset, source_list):
 
             # for keep_original_sample=True
             if set(output_paths) <= set(origin_paths):
-                res_list.append(sample)
+                res_list.append({
+                    'text': sample['text'],
+                    'videos': sample['videos']
+                })
                 continue
 
             source = source_list[idx]