From b60f51e7be22a3c0d3c8218fd1f643aca76357a5 Mon Sep 17 00:00:00 2001 From: Sasha Rahlin Date: Thu, 25 Apr 2024 15:13:57 -0500 Subject: [PATCH] Add a read-only current_file attribute for G3MultiFileWriter This is useful for subclassing this writer for, e.g. writing files to the grid. --- core/src/G3MultiFileWriter.cxx | 40 +++++++++++++++++++++++++++++----- core/tests/multifileio.py | 16 ++++++++++---- 2 files changed, 47 insertions(+), 9 deletions(-) diff --git a/core/src/G3MultiFileWriter.cxx b/core/src/G3MultiFileWriter.cxx index e8acc4af..ad9bbfec 100644 --- a/core/src/G3MultiFileWriter.cxx +++ b/core/src/G3MultiFileWriter.cxx @@ -20,11 +20,13 @@ class G3MultiFileWriter : public G3Module { boost::python::object divide_on = boost::python::object()); virtual ~G3MultiFileWriter(); void Process(G3FramePtr frame, std::deque &out); + std::string CurrentFile() { return current_filename_; } private: bool CheckNewFile(G3FramePtr frame); std::string filename_; boost::python::object filename_callback_; + std::string current_filename_; size_t size_limit_; std::vector always_break_on_; @@ -141,6 +143,8 @@ G3MultiFileWriter::CheckNewFile(G3FramePtr frame) } + current_filename_ = filename; + if (boost::algorithm::ends_with(filename, ".gz")) stream_.push(boost::iostreams::gzip_compressor()); if (boost::algorithm::ends_with(filename, ".bz2")) { @@ -201,8 +205,34 @@ void G3MultiFileWriter::Process(G3FramePtr frame, std::deque &out) out.push_back(frame); } -EXPORT_G3MODULE("core", G3MultiFileWriter, (init >(args("filename", "size_limit", "divide_on"))), - "Writes frames to disk into a sequence of files. Once a file exceeds the number of bytes specified in size_limit, it will start a new file. Files are named based on filename. If passed a string for filename with a printf-style specifier, that specifier will be replaced by a zero-indexed sequence number. For example, outfile-%03u.g3.gz would produce a sequence of files named outfile-000.g3.gz, outfile-001.g3.gz, etc. Alternatively, you can pass a callable that is passed the first frame in the new file and the sequence number and returns a path to the new file. Any frames besides Timepoint and Scan frames have the most recent frame of each type prepended to all new files.\n\n" - "More complex behavior can be obtained with the optional divide_on argument. This can be an iterable of frame types (e.g. [core.G3FrameType.Observation]) or a callable. In the iterable case, the presence of any frame with a type in the list will cause the creation of a new file even if the file size threshold has not yet been met. This is useful to create files based on, for example, observation boundaries. For more flexibility, you can also pass a python callable as divide_on. This callable will be passed each frame in turn. If it returns True (or something with positive truth-value), a new file will be started at that frame." -); - +PYBINDINGS("core") { + using namespace boost::python; + + class_, boost::shared_ptr, + boost::noncopyable>("G3MultiFileWriter", + "Writes frames to disk into a sequence of files. Once a file exceeds " + "the number of bytes specified in size_limit, it will start a new file. " + "Files are named based on filename. If passed a string for filename " + "with a printf-style specifier, that specifier will be replaced by a " + "zero-indexed sequence number. For example, outfile-%03u.g3.gz would " + "produce a sequence of files named outfile-000.g3.gz, outfile-001.g3.gz, " + "etc. Alternatively, you can pass a callable that is passed the first " + "frame in the new file and the sequence number and returns a path to " + "the new file. Any frames besides Timepoint and Scan frames have the " + "most recent frame of each type prepended to all new files.\n\n" + "More complex behavior can be obtained with the optional divide_on " + "argument. This can be an iterable of frame types (e.g. " + "[core.G3FrameType.Observation]) or a callable. In the iterable case, " + "the presence of any frame with a type in the list will cause the " + "creation of a new file even if the file size threshold has not yet " + "been met. This is useful to create files based on, for example, " + "observation boundaries. For more flexibility, you can also pass a " + "python callable as divide_on. This callable will be passed each " + "frame in turn. If it returns True (or something with positive " + "truth-value), a new file will be started at that frame.", + init >((arg("filename"), + arg("size_limit"), arg("divide_on")=object()))) + .def_readonly("current_file", &G3MultiFileWriter::CurrentFile) + .def_readonly("__g3module__", true) + ; +} diff --git a/core/tests/multifileio.py b/core/tests/multifileio.py index 4cec7e1c..134efbe7 100755 --- a/core/tests/multifileio.py +++ b/core/tests/multifileio.py @@ -19,11 +19,19 @@ def addinfo(fr): fr['count'] = n n += 1 pipe.Add(addinfo) + +class checkwriter(core.G3MultiFileWriter): + def __call__(self, frame): + if frame.type == core.G3FrameType.EndProcessing: + assert os.path.exists(self.current_file) + super().__call__(frame) + pipe.Add(lambda fr: fr.type != core.G3FrameType.PipelineInfo) # Avoid extra frames that complicate accounting -pipe.Add(core.G3MultiFileWriter, filename='multitest-%02u.g3', size_limit=20*1024) -pipe.Add(core.G3MultiFileWriter, filename=lambda frame,seq: 'multitest2-%02d.g3' % seq, size_limit=20*1024) -pipe.Add(core.G3MultiFileWriter, filename='multitest3-%02u.g3', size_limit=20*1024, divide_on=[core.G3FrameType.Timepoint]) -pipe.Add(core.G3MultiFileWriter, filename='multitest4-%02u.g3', size_limit=2000000*1024, divide_on=lambda fr: fr['count'] % 200 == 0) +pipe.Add(checkwriter, filename='multitest-%02u.g3', size_limit=20*1024) +pipe.Add(checkwriter, filename=lambda frame,seq: 'multitest2-%02d.g3' % seq, size_limit=20*1024) +pipe.Add(checkwriter, filename='multitest3-%02u.g3', size_limit=20*1024, divide_on=[core.G3FrameType.Timepoint]) +pipe.Add(checkwriter, filename='multitest4-%02u.g3', size_limit=2000000*1024, divide_on=lambda fr: fr['count'] % 200 == 0) + pipe.Run() # Check that various ways of splitting produce the right number of files