Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Punycode speed improvements #20

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion bench.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,8 @@ int main(int argc, char* argv[]) {
size_t runs = 5;

Url::Url base_url(base);
std::string full = Url::Url(relative).relative_to(base_url).str();
Url::Url full_url = Url::Url(relative).relative_to(base_url);
std::string full = full_url.str();

bench("parse", count, runs, [full]() {
Url::Url parsed(full);
Expand All @@ -65,4 +66,8 @@ int main(int argc, char* argv[]) {
bench("parse + punycode", count, runs, [full]() {
Url::Url(full).punycode();
});

bench("punycode + unpunycode", count, runs, [full_url]() mutable {
full_url.punycode().unpunycode();
});
}
20 changes: 18 additions & 2 deletions include/punycode.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,12 @@ namespace Url

// The highest codepoint in unicode
const punycode_uint MAX_PUNYCODE_UINT = std::numeric_limits<punycode_uint>::max();
//Utf8::MAX_CODEPOINT;
//std::numeric_limits<punycode_uint>::max();

/**
* Punycode the utf-8-encoded range [begin, end) and append the result to str.
*/
std::string& encode(std::string& str, std::string::const_iterator begin,
std::string::const_iterator end);

/**
* Replace utf-8-encoded str with its punycode encoding.
Expand All @@ -67,6 +71,12 @@ namespace Url
*/
std::string encode(const std::string& str);

/**
* Decode the punycoded range [begin, end) and append its utf-8 form to str.
*/
std::string& decode(std::string& str, std::string::const_iterator begin,
std::string::const_iterator end);

/**
* Replace punycoded str with its utf-8-encoded form.
*/
Expand All @@ -82,6 +92,12 @@ namespace Url
*/
bool needsPunycoding(const std::string& str);

/**
* Determine if the characters between these two iterators need punycoding.
*/
bool needsPunycoding(const std::string::const_iterator& begin,
const std::string::const_iterator& end);

/**
* Internal function for calculating bias.
*/
Expand Down
73 changes: 49 additions & 24 deletions src/punycode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,15 @@ namespace Url
{

/**
 * Replace the utf-8-encoded contents of str with their punycode encoding.
 *
 * The encoding is produced into a scratch buffer by the iterator-range
 * overload, because that overload reads from str while appending to its
 * output argument. The finished buffer is then swapped into str.
 *
 * @param str  in: utf-8-encoded text; out: its punycode form.
 * @return     reference to str.
 */
std::string& Punycode::encode(std::string& str)
{
    std::string output;
    encode(output, str.cbegin(), str.cend());

    // Hand the finished buffer over rather than copying it a second time.
    str.swap(output);
    return str;
}

std::string& Punycode::encode(std::string& output,
std::string::const_iterator begin,
std::string::const_iterator end)
{
// Pseudocode copied from https://tools.ietf.org/html/rfc3492#section-6.3
//
Expand All @@ -18,25 +27,26 @@ namespace Url
punycode_uint n = INITIAL_N;
punycode_uint delta = 0;
punycode_uint bias = INITIAL_BIAS;
std::string output;

// let h = b = the number of basic code points in the input
size_t h = 0;
size_t b = 0;

// Accumulate the non-basic codepoints
std::vector<punycode_uint> codepoints;
for (auto it = str.cbegin(); it != str.cend(); )
while (begin != end)
{
Utf8::codepoint_t value = Utf8::readCodepoint(it, str.cend());
Utf8::codepoint_t value = Utf8::readCodepoint(begin, end);
if (value < 0x80)
{
// copy them to the output in order
output.append(1, static_cast<char>(value));
++h;
++b;
}
codepoints.push_back(value);
}

// let h = b = the number of basic code points in the input
size_t h = output.size();
size_t b = h;

// copy a delimiter if b > 0
if (b > 0)
{
Expand Down Expand Up @@ -125,9 +135,8 @@ namespace Url
++delta;
++n;
}

str.assign(output);
return str;

return output;
}

std::string Punycode::encode(const std::string& str)
Expand All @@ -137,7 +146,8 @@ namespace Url
return result;
}

std::string& Punycode::decode(std::string& str)
std::string& Punycode::decode(std::string& str, std::string::const_iterator begin,
std::string::const_iterator end)
{
// Pseudocode copied from https://tools.ietf.org/html/rfc3492#section-6.2
//
Expand All @@ -150,15 +160,18 @@ namespace Url
punycode_uint bias = INITIAL_BIAS;
std::vector<punycode_uint> codepoints;

size_t index = str.rfind('-');
if (index == std::string::npos)
std::string::const_iterator index = end;
for (; index != begin; --index)
{
index = 0;
if (*index == '-')
{
break;
}
}

// consume all code points before the last delimiter (if there is one)
// and copy them to output, fail on any non-basic code point
for (auto it = str.begin(); it != (str.begin() + index); ++it)
for (auto it = begin; it != index; ++it)
{
if (static_cast<unsigned char>(*it) > 127U)
{
Expand All @@ -169,13 +182,13 @@ namespace Url

// if more than zero code points were consumed then consume one more
// (which will be the last delimiter)
if (index > 0)
if (index != begin)
{
index += 1;
++index;
}

// while the input is not exhausted do begin
for (auto it = (str.begin() + index); it != str.end(); ++it)
for (auto it = index; it != end; ++it)
{
// let oldi = i
// let w = 1
Expand All @@ -186,7 +199,7 @@ namespace Url
for (punycode_uint k = BASE; ; k += BASE, ++it)
{
// consume a code point, or fail if there was none to consume
if (it == str.end())
if (it == end)
{
throw std::invalid_argument("Premature termination");
}
Expand Down Expand Up @@ -275,16 +288,22 @@ namespace Url
++i;
}

std::string output;
for (auto it = codepoints.begin(); it != codepoints.end(); ++it)
{
Utf8::writeCodepoint(output, *it);
Utf8::writeCodepoint(str, *it);
}
str.assign(output);

return str;
}

std::string& Punycode::decode(std::string& str)
{
std::string output;
decode(output, str.cbegin(), str.cend());
str.assign(output);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same thing about assignment operator.

return str;
}

std::string Punycode::decode(const std::string& str)
{
std::string result(str);
Expand All @@ -293,10 +312,16 @@ namespace Url
}

bool Punycode::needsPunycoding(const std::string& str)
{
return needsPunycoding(str.cbegin(), str.cend());
}

/**
 * Determine whether any byte in [begin, end) has its high bit set, i.e.
 * lies outside the 7-bit ASCII range.
 *
 * @param begin  first character to inspect.
 * @param end    one past the last character to inspect.
 * @return       true if at least one such byte is present, false otherwise
 *               (including for an empty range).
 */
bool Punycode::needsPunycoding(const std::string::const_iterator& begin,
                               const std::string::const_iterator& end)
{
    return std::any_of(
        begin,
        end,
        // Go through unsigned char so the mask is applied to the raw byte value.
        [](char c){ return (static_cast<unsigned char>(c) & 0x80) != 0; });
}

Expand Down
95 changes: 55 additions & 40 deletions src/url.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -709,33 +709,36 @@ namespace Url

std::string encoded;

size_t start = 0;
size_t end = host_.find('.');
while(true)
auto last = host_.cbegin();
for (auto it = host_.cbegin(); it != host_.cend(); ++it)
{
std::string segment = host_.substr(start, end - start);
if (Punycode::needsPunycoding(segment))
if (*it == '.')
{
encoded.append("xn--");
encoded.append(Punycode::encode(segment));
}
else
{
encoded.append(segment);
}
if (Punycode::needsPunycoding(last, it))
{
encoded.append("xn--");
Punycode::encode(encoded, last, it);
}
else
{
encoded.append(last, it);
}

if (end == std::string::npos)
{
break;
}
else
{
encoded.append(1, '.');
start = end + 1;
end = host_.find('.', start);
last = it + 1;
}
}

if (Punycode::needsPunycoding(last, host_.cend()))
{
encoded.append("xn--");
Punycode::encode(encoded, last, host_.cend());
}
else
{
encoded.append(last, host_.cend());
}

host_.assign(encoded);

return *this;
Expand All @@ -744,36 +747,48 @@ namespace Url
/**
 * Convert every punycoded label of the host back to utf-8.
 *
 * The host is split on '.'. A label that is longer than four characters and
 * begins with "xn--" has the part after that prefix handed to
 * Punycode::decode, which appends the decoded text to the result. Any other
 * label (including one that is exactly "xn--" or shorter) is copied through
 * unchanged. Labels are re-joined with '.'.
 *
 * host_ is only overwritten after every label has been processed.
 *
 * @return reference to this Url.
 */
Url& Url::unpunycode()
{
    std::string unencoded;

    // Append the label [first, last) to unencoded, decoding it when it is
    // longer than the four-character "xn--" prefix and starts with it.
    auto appendLabel = [this, &unencoded](std::string::const_iterator first,
                                          std::string::const_iterator last)
    {
        const size_t length = last - first;
        const size_t offset = first - host_.cbegin();
        if (length > 4 && host_.compare(offset, 4, "xn--") == 0)
        {
            Punycode::decode(unencoded, first + 4, last);
        }
        else
        {
            unencoded.append(first, last);
        }
    };

    auto last = host_.cbegin();
    for (auto it = host_.cbegin(); it != host_.cend(); ++it)
    {
        if (*it == '.')
        {
            appendLabel(last, it);
            unencoded.append(1, '.');
            last = it + 1;
        }
    }

    // Whatever follows the final '.' (or the whole host if it has none).
    appendLabel(last, host_.cend());

    host_.assign(unencoded);

    return *this;
}

Expand Down
12 changes: 12 additions & 0 deletions test/test-url.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1108,6 +1108,18 @@ TEST(DefragTest, Defrag)
Url::Url("http://foo.com/path#fragment").defrag().str());
}

// A trailing host label that is shorter than the "xn--" prefix must pass
// through unpunycode() unchanged.
TEST(PunycodeTest, UnpunycodeShortIdentifierAtEnd)
{
    const std::string input("http://www.xn-/");
    Url::Url url(input);
    EXPECT_EQ(input, url.unpunycode().str());
}

// A leading host label that is shorter than the "xn--" prefix must pass
// through unpunycode() unchanged.
TEST(PunycodeTest, UnpunycodeShortIdentifierAtStart)
{
    const std::string input("http://xn-.com/");
    Url::Url url(input);
    EXPECT_EQ(input, url.unpunycode().str());
}

TEST(PunycodeTest, German)
{
std::string unencoded("http://www.kündigen.de/");
Expand Down