Skip to content

Commit

Permalink
fix memory issues (#257)
Browse files Browse the repository at this point in the history
  • Loading branch information
pavel-esir authored Sep 9, 2024
1 parent c990f9c commit 3dde884
Show file tree
Hide file tree
Showing 4 changed files with 22 additions and 16 deletions.
10 changes: 5 additions & 5 deletions src/bpe_tokenizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -253,19 +253,19 @@ BPETokenizerImpl::BPETokenizerImpl(
Vocab new_vocab = vocab;

for (size_t i = 0; i < merges.size(); i++) {
auto pair = merges.at(i);
auto& pair = merges.at(i);
auto id_pair = std::make_pair(vocab.at(pair.first), vocab.at(pair.second));
new_merges[id_pair] = {i, vocab.at(pair.first + pair.second)};
new_vocab.erase(pair.first + pair.second);
}

this->m_vocab = new_vocab;
this->m_merges = new_merges;
m_vocab = std::move(new_vocab);
m_merges = std::move(new_merges);

m_trie = std::make_unique<Trie>();
for(const auto& word: new_vocab) {
for(const auto& word: m_vocab) {
const auto token = std::vector<unsigned char>(word.first.begin(), word.first.end());
m_trie->add(token, word.second);
}
m_cache.reserve(cache_capacity);
}
}
2 changes: 1 addition & 1 deletion src/regex_split.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ class RegexSplit : public ov::op::Op {
mutable std::shared_ptr<PCRE2Wrapper> m_search_pattern_pcre2;
mutable std::shared_ptr<std::set<std::string>> m_skip_tokens;
mutable std::string m_behaviour = "remove";
mutable SplitMode m_split_mode;
mutable SplitMode m_split_mode = SplitMode::REMOVED;
bool m_invert = false;
int m_max_splits = -1;

Expand Down
8 changes: 4 additions & 4 deletions src/utf8_validate.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,10 @@ bool UTF8Validate::evaluate(ov::TensorVector& outputs, const ov::TensorVector& i
// UTF-8 code points should not intersect:
// if 2 byte object has code point < 0x80 then it's not valid 2 byte utf-8,
// even if it has a valid bit mask.
const uint64_t code_point_starts[4] = {0x0, 0x80, 0x800, 0x10000};
uint64_t utf_code_point;
size_t bytes_to_consume; // Number of additional 0b10xxxxxx bytes to consume to produce a valid UTF8 symbol.
size_t num_bytes;
const uint32_t code_point_starts[4] = {0x0, 0x80, 0x800, 0x10000};
uint32_t utf_code_point;
uint32_t bytes_to_consume; // Number of additional 0b10xxxxxx bytes to consume to produce a valid UTF8 symbol.
uint32_t num_bytes;

size_t out_idx = begins[0];
for (size_t i = 0; i < begins_shape[0]; i++) {
Expand Down
18 changes: 12 additions & 6 deletions src/utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -254,11 +254,6 @@ std::string PCRE2Wrapper::substitute(const std::string& orig_str,
pcre2_match_data* match_data = pcre2_match_data_create_from_pattern(m_compiled, NULL);
PCRE2_SIZE subject_length = orig_str.size();

// Usually found pattern is replaced by shorter string, but set 3 times more space for safety.
// Allocate dynamically since length depends dynamically on the length of input string.
// Allocated memory will be freed at the exit from function.
auto buffer = (PCRE2_UCHAR*) std::malloc(sizeof(PCRE2_UCHAR) * subject_length * 3);

// Check if the string matches the pattern
int match_result = pcre2_match(
m_compiled,
Expand All @@ -272,7 +267,17 @@ std::string PCRE2Wrapper::substitute(const std::string& orig_str,
pcre2_match_data_free(match_data);
return orig_str;
}


// Usually found pattern is replaced by shorter string, but set 3 times more space for safety.
// Allocate dynamically since length depends dynamically on the length of input string.
// Allocated memory will be freed at the exit from function.
auto buffer = (PCRE2_UCHAR*) std::malloc(sizeof(PCRE2_UCHAR) * subject_length * 3);
if (buffer == nullptr) {
std::cerr << "Memory allocation failed" << std::endl;
pcre2_match_data_free(match_data);
return orig_str;
}

int rc = pcre2_substitute(
m_compiled,
(PCRE2_SPTR) orig_str.c_str(), orig_str.size(),
Expand All @@ -292,6 +297,7 @@ std::string PCRE2Wrapper::substitute(const std::string& orig_str,
std::cerr << "PCRE2 substitution failed with error code " << rc << std::endl;
}
pcre2_match_data_free(match_data);
std::free(buffer);
return orig_str;
}
auto res = std::string(reinterpret_cast<char*>(buffer), subject_length);
Expand Down

0 comments on commit 3dde884

Please sign in to comment.