Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Option to exclude some characters to be decoded #22

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ require 'htmlentities'
coder = HTMLEntities.new
string = "&eacute;lan"
coder.decode(string) # => "élan"
string = "&lt;&eacute;lan&#x3E;"
coder.decode(string, exclude: ['<', '>']) # => "&lt;élan&#x3E;"
```

### Encoding
Expand Down
4 changes: 2 additions & 2 deletions lib/htmlentities.rb
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,8 @@ def initialize(flavor='xhtml1')
#
# Unknown named entities will not be converted
#
def decode(source)
(@decoder ||= Decoder.new(@flavor)).decode(source)
def decode(source, options = {})
  # Build the decoder for this instance's flavor on first use and keep it
  # in @decoder so later calls reuse the same object.
  @decoder ||= Decoder.new(@flavor)
  # +options+ is handed to the decoder unchanged.
  @decoder.decode(source, options)
end

#
Expand Down
42 changes: 30 additions & 12 deletions lib/htmlentities/decoder.rb
Original file line number Diff line number Diff line change
Expand Up @@ -6,18 +6,36 @@ def initialize(flavor)
@entity_regexp = entity_regexp
end

def decode(source)
prepare(source).gsub(@entity_regexp){
if $1 && codepoint = @map[$1]
[codepoint].pack('U')
elsif $2
[$2.to_i(10)].pack('U')
elsif $3
[$3.to_i(16)].pack('U')
else
$&
end
}
# Decode the HTML entities found in +source+ and return the decoded string.
#
# Every match of @entity_regexp is resolved in this order: capture 1 is
# looked up in @map (named entity), capture 2 is read as a decimal code
# point, capture 3 as a hexadecimal code point. A match that resolves to
# nothing (e.g. an unknown name) is left exactly as it was written.
#
# options[:exclude] - a character, or a list of characters, that must stay
#                     encoded: an entity that would decode to one of them is
#                     kept verbatim. Defaults to no exclusions.
def decode(source, options = {})
  # Read the option without writing a default back: the caller's hash is
  # left untouched and may be frozen. Array() also accepts nil or a single
  # character.
  excluded = Array(options[:exclude])

  prepare(source).gsub(@entity_regexp) do
    match = Regexp.last_match
    codepoint =
      if match[1] && (named = @map[match[1]])
        named
      elsif match[2]
        match[2].to_i(10)
      elsif match[3]
        match[3].to_i(16)
      end
    # Nothing to decode to: keep the original text.
    next match[0] unless codepoint

    char = [codepoint].pack('U')
    excluded.include?(char) ? match[0] : char
  end
end

private
Expand Down
67 changes: 65 additions & 2 deletions test/decoding_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@ def setup
@entities = [:xhtml1, :html4, :expanded].map{ |a| HTMLEntities.new(a) }
end

def assert_decode(expected, input)
def assert_decode(expected, input, options = {})
@entities.each do |coder|
assert_equal expected, coder.decode(input)
assert_equal expected, coder.decode(input, options)
end
end

Expand All @@ -19,31 +19,60 @@ def test_should_decode_basic_entities
assert_decode '"', '&quot;'
end

def test_should_not_decode_excluded_basic_entities
  # Entity => the character it stands for; excluding that character must
  # leave the entity untouched.
  { '&amp;' => '&', '&lt;' => '<', '&quot;' => '"' }.each do |entity, char|
    assert_decode(entity, entity, exclude: [char])
  end
end

def test_should_decode_extended_named_entities
  # Pairs of [expected character, named entity].
  [
    ['±', '&plusmn;'],
    ['ð', '&eth;'],
    ['Œ', '&OElig;'],
    ['œ', '&oelig;']
  ].each { |expected, entity| assert_decode(expected, entity) }
end

def test_should_not_decode_excluded_extended_named_entities
  # Entity => the character it stands for; each one is expected to come
  # back unchanged when that character is excluded.
  {
    '&plusmn;' => '±',
    '&eth;' => 'ð',
    '&OElig;' => 'Œ',
    '&oelig;' => 'œ'
  }.each do |entity, char|
    assert_decode(entity, entity, exclude: [char])
  end
end

def test_should_decode_decimal_entities
  # Pairs of [expected character, decimal entity].
  [
    ['“', '&#8220;'],
    ['…', '&#8230;'],
    [' ', '&#32;']
  ].each { |expected, entity| assert_decode(expected, entity) }
end

def test_should_not_decode_excluded_decimal_entities
  # Decimal entity => the character it stands for; excluding the character
  # must keep the entity as written.
  { '&#8220;' => '“', '&#8230;' => '…', '&#32;' => ' ' }.each do |entity, char|
    assert_decode(entity, entity, exclude: [char])
  end
end

def test_should_decode_hexadecimal_entities
  # Pairs of [expected character, hexadecimal entity]; the backtick appears
  # twice, once with leading zeros in the entity and once without.
  [
    ['−', '&#x2212;'],
    ['—', '&#x2014;'],
    ['`', '&#x0060;'],
    ['`', '&#x60;']
  ].each { |expected, entity| assert_decode(expected, entity) }
end

def test_should_not_decode_excluded_hexadecimal_entities
  # Hexadecimal entity => the character it stands for; excluding the
  # character must keep the entity as written.
  {
    '&#x2212;' => '−',
    '&#x2014;' => '—',
    '&#x0060;' => '`',
    '&#x60;' => '`'
  }.each do |entity, char|
    assert_decode(entity, entity, exclude: [char])
  end
end

def test_should_not_mutate_string_being_decoded
  original = "&lt;&#163;"
  input = original.dup

  # Plain call, no options.
  HTMLEntities.new.decode(input)
  assert_equal original, input

  # Same check with an exclusion list. The option key is :exclude; a
  # misspelled key would leave this call equivalent to the plain one.
  HTMLEntities.new.decode(input, exclude: ['a'])
  assert_equal original, input
end

Expand All @@ -59,16 +88,38 @@ def test_should_decode_text_with_mix_of_entities
)
end

def test_should_decode_text_with_mix_of_entities_only_not_excluded
  # Just a random headline - I needed something with accented letters.
  headline = 'Le tabac pourrait bient&ocirc;t &#234;tre banni dans tous les lieux publics en France'

  # Excluded character occurs in the text: its entity stays encoded.
  headline_keeping_ocirc = 'Le tabac pourrait bient&ocirc;t être banni dans tous les lieux publics en France'
  assert_decode(headline_keeping_ocirc, headline, exclude: ['ô'])

  # Only the decimal entity for the excluded character is kept.
  mixed_input = '&quot;bient&ocirc;t&quot; &amp; &#25991;&#x5b57;'
  mixed_expected = '"bientôt" & &#25991;字'
  assert_decode(mixed_expected, mixed_input, exclude: ['文'])

  # Excluded character does not occur in the text: everything is decoded.
  headline_decoded = 'Le tabac pourrait bientôt être banni dans tous les lieux publics en France'
  assert_decode(headline_decoded, headline, exclude: ['文'])
end

def test_should_decode_empty_string
  # An empty input is expected to decode to an empty string, with or
  # without an exclusion list.
  expected = ''
  input = ''
  assert_decode(expected, input)
  assert_decode(expected, input, exclude: ['a'])
end

def test_should_skip_unknown_entity
  # An entity name that is not recognised is expected to pass through
  # unchanged, with or without an exclusion list.
  unknown = '&bogus;'
  assert_decode(unknown, unknown)
  assert_decode(unknown, unknown, exclude: ['a'])
end

def test_should_decode_double_encoded_entity_once
  # Only one layer of encoding is expected to be removed, with or without
  # an exclusion list.
  decoded_once = '&amp;'
  encoded_twice = '&amp;amp;'
  assert_decode(decoded_once, encoded_twice)
  assert_decode(decoded_once, encoded_twice, exclude: ['a'])
end

# Faults found and patched by Moonwolf
Expand All @@ -78,13 +129,25 @@ def test_should_decode_full_hexadecimal_range
end
end

def test_should_not_decode_full_hexadecimal_range_if_excluded
  # For each code point 0..127, exclude the character itself and expect
  # its hexadecimal entity to come back unchanged.
  0.upto(127) do |code|
    entity = "&\#x#{code.to_s(16)};"
    char = [code].pack('U')
    assert_decode(entity, "&\#x#{code.to_s(16)};", exclude: [char])
  end
end

# Reported by Dallas DeVries and Johan Duflost
def test_should_decode_named_entities_reported_as_missing_in_3_0_1
  # Code point => named entity expected to decode to that code point.
  { 178 => '&sup2;', 8226 => '&bull;', 948 => '&delta;' }.each do |codepoint, entity|
    assert_decode([codepoint].pack('U'), entity)
  end
end

def test_should_not_decode_named_entities_reported_as_missing_in_3_0_1_if_excluded
  # Named entity => code point of its character; excluding that character
  # must keep the entity as written.
  { '&sup2;' => 178, '&bull;' => 8226, '&delta;' => 948 }.each do |entity, codepoint|
    assert_decode(entity, entity, exclude: [[codepoint].pack('U')])
  end
end

# Reported by ckruse
def test_should_decode_only_first_element_in_masked_entities
input = '&amp;#3346;'
Expand Down