Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(core-clp): Add BoundedReader to prevent out-of-bound reads in segmented input streams. #624

Merged
merged 15 commits into from
Dec 13, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions components/core/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -353,6 +353,8 @@ set(SOURCE_FILES_unitTest
src/clp/clp/run.hpp
src/clp/clp/utils.cpp
src/clp/clp/utils.hpp
src/clp/CheckpointReader.cpp
src/clp/CheckpointReader.hpp
src/clp/CurlDownloadHandler.cpp
src/clp/CurlDownloadHandler.hpp
src/clp/CurlEasyHandle.hpp
Expand Down Expand Up @@ -549,6 +551,7 @@ set(SOURCE_FILES_unitTest
tests/LogSuppressor.hpp
tests/test-Array.cpp
tests/test-BufferedFileReader.cpp
tests/test-CheckpointReader.cpp
tests/test-clp_s-end_to_end.cpp
tests/test-EncodedVariableInterpreter.cpp
tests/test-encoding_methods.cpp
Expand Down
37 changes: 37 additions & 0 deletions components/core/src/clp/CheckpointReader.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#include "CheckpointReader.hpp"

namespace clp {
auto CheckpointReader::try_seek_from_begin(size_t pos) -> ErrorCode {
m_cur_pos = pos > m_checkpoint ? m_checkpoint : pos;
auto rc = m_reader->try_seek_from_begin(m_cur_pos);
if (ErrorCode_Success != rc) {
return rc;
}
gibber9809 marked this conversation as resolved.
Show resolved Hide resolved
if (m_cur_pos >= m_checkpoint) {
return ErrorCode_EndOfFile;
}
return ErrorCode_Success;
}
gibber9809 marked this conversation as resolved.
Show resolved Hide resolved

auto CheckpointReader::try_read(char* buf, size_t num_bytes_to_read, size_t& num_bytes_read)
-> ErrorCode {
if ((m_cur_pos + num_bytes_to_read) > m_checkpoint) {
num_bytes_to_read = m_checkpoint - m_cur_pos;
}

if (m_cur_pos == m_checkpoint) {
return ErrorCode_EndOfFile;
}
gibber9809 marked this conversation as resolved.
Show resolved Hide resolved

auto rc = m_reader->try_read(buf, num_bytes_to_read, num_bytes_read);
gibber9809 marked this conversation as resolved.
Show resolved Hide resolved
m_cur_pos += num_bytes_read;
if (ErrorCode_EndOfFile == rc) {
if (0 == num_bytes_read) {
return ErrorCode_EndOfFile;
}
} else if (ErrorCode_Success != rc) {
return rc;
}
return ErrorCode_Success;
}
} // namespace clp
77 changes: 77 additions & 0 deletions components/core/src/clp/CheckpointReader.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
#ifndef CLP_CHECKPOINTREADER_HPP
#define CLP_CHECKPOINTREADER_HPP

#include "ReaderInterface.hpp"

namespace clp {
/**
* CheckpointReader is a ReaderInterface designed to wrap other ReaderInterfaces and prevent users
* from reading or seeking beyond a certain point in the underlying input stream.
*
* This is useful when the underlying input stream is divided into several logical segments and we
* want to prevent a reader for an earlier segment consuming any bytes from a later segment. In
* particular, reading part of a later segment may force the reader for that later segment to seek
* backwards, which can be either inefficient or impossible for certain kinds of input streams.
*/
class CheckpointReader : public ReaderInterface {
public:
// Constructor
explicit CheckpointReader(ReaderInterface* reader, size_t checkpoint)
: m_reader(reader),
m_checkpoint(checkpoint) {
if (nullptr != m_reader) {
m_cur_pos = m_reader->get_pos();
}
if (m_cur_pos > m_checkpoint) {
throw ReaderInterface::OperationFailed(ErrorCode_BadParam, __FILE__, __LINE__);
}
}
gibber9809 marked this conversation as resolved.
Show resolved Hide resolved

// Methods implementing the ReaderInterface
/**
* Tries to get the current position of the read head in the underlying reader.
* @param pos Position of the read head in the underlying reader
* @return ErrorCode_Success on success
* @return ErrorCode_errno on failure
*/
auto try_get_pos(size_t& pos) -> ErrorCode override { return m_reader->try_get_pos(pos); }

/**
* Tries to seek to the given position, limited by the checkpoint. If pos is past the checkpoint
* then this reader seeks up to the checkpoint in the underlying stream, then returns
* ErrorCode_EndOfFile on success or ErrorCode_errno otherwise.
* @param pos
* @return ErrorCode_Success on success
* @return ErrorCode_EndOfFile on EOF or if trying to seek beyond the checkpoint
* @return ErrorCode_errno on failure
*/
auto try_seek_from_begin(size_t pos) -> ErrorCode override;

/**
* Tries to read up to a given number of bytes from the file, limited by the checkpoint. If the
* requested read would advance the read head beyond the checkpoint then the read size is
* adjusted such that the read head is advanced to the checkpoint. If a read is attempted when
* the read head is at the checkpoint then ErrorCode_EndOfFile is returned.
* @param buf
* @param num_bytes_to_read The number of bytes to try and read
* @param num_bytes_read The actual number of bytes read
* @return ErrorCode_errno on error
* @return ErrorCode_EndOfFile on EOF or trying to read after hitting checkpoint
* @return ErrorCode_Success on success
*/
auto
try_read(char* buf, size_t num_bytes_to_read, size_t& num_bytes_read) -> ErrorCode override;

auto try_read_to_delimiter(char delim, bool keep_delimiter, bool append, std::string& str)
-> ErrorCode override {
return ErrorCode_Unsupported;
}

private:
ReaderInterface* m_reader{nullptr};
size_t m_checkpoint{};
size_t m_cur_pos{};
};
} // namespace clp

#endif // CLP_CHECKPOINTREADER_HPP
4 changes: 4 additions & 0 deletions components/core/src/clp/StringReader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,10 @@ ErrorCode StringReader::try_read(char* buf, size_t num_bytes_to_read, size_t& nu
}

ErrorCode StringReader::try_seek_from_begin(size_t pos) {
if (pos > input_string.size()) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this only for supporting a new test case?

What you intend to do makes sense to me, but It's bit annoying that the behavior of standard seek interface allows seeking beyond the file ending position, so I feel we need some justification when we decide to change the behavior.

@kirkrodrigues any comment?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would classify this as a bug I'm fixing in the StringReader class so that I can use it for tests. Every other reader we have will EOF if you seek past the end from what I've seen.

Actually, the way the rest of this class is written if you first seek past the end of the input buffer it will happily let you read data beyond the end of the buffer. I.e. it is very explicitly a bug.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, The FileReader internally calls fseeko, which I believe would allow seeking beyond the end of file.

BufferedFileReader and BufferReader would return ErrorCode_Truncated, and won't update the pos at all if the pos is greater than the maximum length,

From the consistency point of view, maybe we should let it return ErrorCode_Truncated. But also wonder if the rest of your code is dependent on the current behavior.

Copy link
Member

@kirkrodrigues kirkrodrigues Dec 7, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  • The implementations of ReaderInterface have inconsistent behaviour.
    • Some (e.g., FileReader) will rely on the lower implementation to return an error if the seek fails.
    • Some (e.g. BufferReader) will return an error if the seek is past the end, but they won't modify m_pos.
    • Some (e.g., NetworkReader) will return an error if the seek is past the end, and they will modify m_pos.
    • Each of the above returns different error codes.

I think the practical implementation is for try_seek_from_begin to:

  • try to seek until pos
  • if that's past the end of the medium (file/buffer/etc.), m_pos should be updated to just past the last byte.
  • the method should return ErrorCode_EndOfFile.

The reason to update m_pos even though we get to pos is because for some implementations like FileReader, we can't easily check what the last byte is until we seek, and if we seek up to the end of the file, we may not be able to seek backwards to the original m_pos.

If y'all agree, we should open a GH issue to refactor the existing implementations. And for this implementation, we implement the proposal above (basically what Devin's implemented---although on error, we're still updating m_cur_pos, even tough the error may not be EOF?).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure, the standard behavior sounds good to me.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That makes sense to me.

Also went and updated BoundedReader to only update m_curr_pos on error if that error is EOF.

this->pos = input_string.size();
return ErrorCode_EndOfFile;
}
this->pos = pos;
return ErrorCode_Success;
}
Expand Down
94 changes: 94 additions & 0 deletions components/core/tests/test-CheckpointReader.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
#include <string>

#include <Catch2/single_include/catch2/catch.hpp>

#include "../src/clp/CheckpointReader.hpp"
#include "../src/clp/ErrorCode.hpp"
#include "../src/clp/StringReader.hpp"

TEST_CASE("Test Checkpoint Reader", "[CheckpointReader]") {
constexpr char cTestString[]{"0123456789"};
constexpr size_t cTestStringLen{std::char_traits<char>::length(cTestString)};

SECTION("CheckpointReader does not support try_read_to_delimiter") {
clp::StringReader string_reader;
string_reader.open(cTestString);
clp::CheckpointReader checkpoint_reader{&string_reader, cTestStringLen};
std::string tmp;
REQUIRE(clp::ErrorCode_Unsupported
== checkpoint_reader.try_read_to_delimiter('0', false, false, tmp));
}

SECTION("CheckpointReader does not allow reads beyond end of underlying stream.") {
clp::StringReader string_reader;
string_reader.open(cTestString);
clp::CheckpointReader checkpoint_reader{&string_reader, cTestStringLen + 1};
char buf[cTestStringLen + 1];
size_t num_bytes_read{};
auto rc = checkpoint_reader.try_read(buf, cTestStringLen + 1, num_bytes_read);
REQUIRE(clp::ErrorCode_Success == rc);
REQUIRE(num_bytes_read == cTestStringLen);
REQUIRE(cTestStringLen == string_reader.get_pos());
REQUIRE(cTestStringLen == checkpoint_reader.get_pos());
}

SECTION("CheckpointReader does not allow reads beyond checkpoint.") {
haiqi96 marked this conversation as resolved.
Show resolved Hide resolved
clp::StringReader string_reader;
string_reader.open(cTestString);
clp::CheckpointReader checkpoint_reader{&string_reader, 1};
char buf[cTestStringLen];
size_t num_bytes_read{};
auto rc = checkpoint_reader.try_read(buf, cTestStringLen, num_bytes_read);
REQUIRE(clp::ErrorCode_Success == rc);
REQUIRE(1 == num_bytes_read);
REQUIRE(1 == string_reader.get_pos());
REQUIRE(1 == checkpoint_reader.get_pos());
}

SECTION("CheckpointReader does allow reads before checkpoint.") {
clp::StringReader string_reader;
string_reader.open(cTestString);
clp::CheckpointReader checkpoint_reader{&string_reader, 1};
haiqi96 marked this conversation as resolved.
Show resolved Hide resolved
char buf{};
size_t num_bytes_read{};
auto rc = checkpoint_reader.try_read(&buf, 1, num_bytes_read);
REQUIRE(clp::ErrorCode_Success == rc);
REQUIRE(1 == num_bytes_read);
REQUIRE(1 == string_reader.get_pos());
REQUIRE(1 == checkpoint_reader.get_pos());
}

SECTION("CheckpointReader does not allow seeks beyond end of underlying stream.") {
clp::StringReader string_reader;
string_reader.open(cTestString);
clp::CheckpointReader checkpoint_reader{&string_reader, cTestStringLen + 1};
auto rc = checkpoint_reader.try_seek_from_begin(cTestStringLen + 1);
REQUIRE(clp::ErrorCode_EndOfFile == rc);
REQUIRE(cTestStringLen == string_reader.get_pos());
REQUIRE(cTestStringLen == checkpoint_reader.get_pos());
}

SECTION("CheckpointReader does not allow seeks beyond checkpoint.") {
clp::StringReader string_reader;
string_reader.open(cTestString);
clp::CheckpointReader checkpoint_reader{&string_reader, 1};
char buf[cTestStringLen];
size_t num_bytes_read{};
auto rc = checkpoint_reader.try_seek_from_begin(cTestStringLen);
REQUIRE(clp::ErrorCode_EndOfFile == rc);
REQUIRE(1 == string_reader.get_pos());
REQUIRE(1 == checkpoint_reader.get_pos());
}

SECTION("CheckpointReader does allow seeks before checkpoint.") {
clp::StringReader string_reader;
string_reader.open(cTestString);
clp::CheckpointReader checkpoint_reader{&string_reader, 2};
char buf[cTestStringLen];
size_t num_bytes_read{};
auto rc = checkpoint_reader.try_seek_from_begin(1);
REQUIRE(clp::ErrorCode_Success == rc);
REQUIRE(1 == string_reader.get_pos());
REQUIRE(1 == checkpoint_reader.get_pos());
}
}
Loading