Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Parse GFM Extended autolinks #71

Draft
wants to merge 21 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 12 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
f8de074
Get basic GFM extended autolinks parsing
stephenreddek Oct 20, 2020
477e2e6
Run ete tests for the basic GFM autolink scenarios
stephenreddek Oct 21, 2020
c63e800
support trailing parenthesis and entity references
stephenreddek Oct 21, 2020
51e03f8
update spec results with handling trailing
stephenreddek Oct 21, 2020
ea477f2
Handling multiple trailing parenthesis
stephenreddek Oct 21, 2020
e92b08f
Make extended autolink processing lower in precedence than standard l…
stephenreddek Oct 23, 2020
cabbb93
Basic support for email autolinks
stephenreddek Oct 23, 2020
3745fd4
Require a period in the domain portion of email autolinks
stephenreddek Oct 23, 2020
0223c13
support trailing / on urls without subpages
stephenreddek Oct 30, 2020
144ceb4
add a test for when the link is nested
stephenreddek Oct 30, 2020
56d93f0
update ete
stephenreddek Oct 30, 2020
e43183d
filter out inner autolinks on html anchor nodes
stephenreddek Feb 12, 2021
953cb50
Handle when there is a period inside an unmatched parenthesis in an a…
stephenreddek Feb 13, 2021
8b5f2a1
Merge branch 'master' of https://github.com/dillonkearns/elm-markdown…
stephenreddek Feb 13, 2021
fea447a
update the ete results after merge
stephenreddek Feb 13, 2021
d6ea1c0
elm-format
stephenreddek Feb 13, 2021
7b4478b
remove filter out anchors logic.
stephenreddek Feb 13, 2021
158dbab
keep a newline at the end of test inputs
stephenreddek Feb 13, 2021
98b58f9
Skip autolinks when they're inside anchor tags
stephenreddek Feb 13, 2021
0d820fc
limit parsing the text portions of inline html
stephenreddek Feb 13, 2021
9bfa6e6
Add tests around underscore emphasis in email autolinks
stephenreddek Feb 13, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 14 additions & 8 deletions spec-results.json
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,8 @@
601,
602,
603,
604,
605,
606,
607,
608
606
],
"Backslash escapes": [
298,
Expand Down Expand Up @@ -558,11 +555,8 @@
601,
602,
603,
604,
605,
606,
607,
608
606
],
"Backslash escapes": [
298,
Expand Down Expand Up @@ -1052,6 +1046,18 @@
28,
29
],
"[extension] Autolinks": [
621,
622,
623,
624,
625,
626,
627,
629,
630,
631
],
"[extension] Strikethrough": [
492
],
Expand Down
269 changes: 258 additions & 11 deletions src/Markdown/InlineParser.elm
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,8 @@ type Meaning
| EmphasisToken Char { leftFringeRank : Int, rightFringeRank : Int }
| SoftLineBreakToken
| HardLineBreakToken
| ExtendedAutolink
| EmailAutolink


findToken : (Token -> Bool) -> List Token -> Maybe ( Token, List Token, List Token )
Expand Down Expand Up @@ -250,6 +252,8 @@ tokenize rawText =
|> mergeByIndex (findHardBreakTokens rawText)
|> mergeByIndex (findAngleBracketLTokens rawText)
|> mergeByIndex (findAngleBracketRTokens rawText)
|> mergeByIndex (findExtendedAutolinkTokens rawText)
|> mergeByIndex (findEmailAutolinkTokens rawText)


{-| Merges two sorted sequences into a sorted sequence
Expand Down Expand Up @@ -685,6 +689,104 @@ regMatchToLinkImageCloseToken regMatch =



-- GFM Auto Link Tokens


{-| Collect a token for every GFM extended URL autolink candidate in `str`.

Each match of `extendedAutoLinkRegex` is converted with
`regMatchToExtendedAutolinkToken`; matches that convert to `Nothing` are dropped.

-}
findExtendedAutolinkTokens : String -> List Token
findExtendedAutolinkTokens str =
    str
        |> Regex.find extendedAutoLinkRegex
        |> List.filterMap regMatchToExtendedAutolinkToken


extendedAutoLinkRegex : Regex
extendedAutoLinkRegex =
-- what if we do this without the negative lookbehind and just make it a sub match?
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Resolve this question

Regex.fromString "(?<=^|\\s|\\*|_|~|\\()(?:(?:https?://)|(?:www\\.))[a-z0-9A-Z_-]+(?:\\.[a-z0-9A-Z_-]+)*(?:/([^\\s<]*))?"
|> Maybe.withDefault Regex.never


extendedAutoLinkTrailingPunctuationRegex : Regex
extendedAutoLinkTrailingPunctuationRegex =
--should this use the isPunctuation helper?
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Resolve

Regex.fromString "[?!\\.,:*_~]+$"
|> Maybe.withDefault Regex.never


{-| Matches a run of one or more entity-reference-like sequences (`&`, then
alphanumerics, then `;`) anchored at the end of the string. Falls back to
`Regex.never` if the pattern fails to compile.
-}
extendedAutoLinkTrailingEntityReferenceRegex : Regex
extendedAutoLinkTrailingEntityReferenceRegex =
    Maybe.withDefault Regex.never
        (Regex.fromString "(&[a-zA-Z0-9]+;)+$")


{-| Turn a raw `extendedAutoLinkRegex` match into an `ExtendedAutolink` token.

The token starts at the match index and spans the match minus the trailing part
that does not belong to the link: trailing punctuation, unmatched closing
parentheses and trailing entity references.

Always returns `Just`; the `Maybe` result lets the function be used with
`List.filterMap`.

-}
regMatchToExtendedAutolinkToken : Regex.Match -> Maybe Token
regMatchToExtendedAutolinkToken regMatch =
    Just
        { index = regMatch.index
        , length = String.length (trimExtendedAutolinkTail regMatch.match)
        , meaning = ExtendedAutolink
        }


{-| Remove everything from the end of `link` that is not part of the autolink.

The three trimming rules are applied repeatedly until none of them changes the
string. Measuring each rule once against the untouched match and summing the
lengths misses combined tails such as `).` or `&amp;.`, where one rule only
becomes applicable after another has removed its suffix.

-}
trimExtendedAutolinkTail : String -> String
trimExtendedAutolinkTail link =
    let
        trimmed =
            link
                |> Regex.replace extendedAutoLinkTrailingPunctuationRegex (always "")
                |> dropUnmatchedClosingParenthesis
                |> Regex.replace extendedAutoLinkTrailingEntityReferenceRegex (always "")
    in
    if trimmed == link then
        link

    else
        -- Every pass only ever shortens the string, so this terminates.
        trimExtendedAutolinkTail trimmed


{-| Drop a single trailing `)` when `link` holds more `)` than `(`.

Only one character is removed per call so that a surplus `)` in the interior of
the link never causes unrelated trailing characters to be cut off.

-}
dropUnmatchedClosingParenthesis : String -> String
dropUnmatchedClosingParenthesis link =
    let
        closing =
            List.length (String.indexes ")" link)

        opening =
            List.length (String.indexes "(" link)
    in
    if String.endsWith ")" link && closing > opening then
        String.dropRight 1 link

    else
        link



-- GFM Email Auto Link Tokens


{-| Collect a token for every GFM email autolink candidate in `str`.

Each match of `emailAutoLinkRegex` is converted with
`regMatchToEmailAutolinkToken`; matches that convert to `Nothing` are dropped.

-}
findEmailAutolinkTokens : String -> List Token
findEmailAutolinkTokens str =
    str
        |> Regex.find emailAutoLinkRegex
        |> List.filterMap regMatchToEmailAutolinkToken


{-| Matches an email-address-like span: a local part, `@`, and a domain that
contains at least one period. The lookbehind requires the span to sit at the
start of the input or directly after whitespace, `*`, `_`, `~` or `(`.
Falls back to `Regex.never` if the pattern fails to compile.
-}
emailAutoLinkRegex : Regex
emailAutoLinkRegex =
    Maybe.withDefault Regex.never
        (Regex.fromString "(?<=^|\\s|\\*|_|~|\\()[a-zA-Z0-9\\._+-]+@[a-zA-Z0-9_-]+((\\.[a-zA-Z0-9_-]+)+)")


{-| Turn a raw `emailAutoLinkRegex` match into an `EmailAutolink` token.

The token covers the whole match. Matches whose final character is not
alphanumeric (and empty matches) are rejected with `Nothing`.

-}
regMatchToEmailAutolinkToken : Regex.Match -> Maybe Token
regMatchToEmailAutolinkToken regMatch =
    let
        endsWithAlphaNum =
            case String.uncons (String.right 1 regMatch.match) of
                Just ( finalChar, _ ) ->
                    Char.isAlphaNum finalChar

                Nothing ->
                    False
    in
    if endsWithAlphaNum then
        Just
            { index = regMatch.index
            , length = String.length regMatch.match
            , meaning = EmailAutolink
            }

    else
        Nothing



-- Angle Brackets Tokens


Expand Down Expand Up @@ -1142,6 +1244,68 @@ autolinkToMatch (Match match) =
Result.Err (Match match)


{-| Build an autolink `Match` for an extended-autolink token.

The link text is the slice of `rawText` covered by the token; the destination is
that text passed through `withProtocol`. Returns `Nothing` when the destination
does not satisfy `urlRegex`.

-}
extendedAutolinkToMatch : String -> Token -> Maybe Match
extendedAutolinkToMatch rawText token =
    let
        tokenEnd =
            token.index + token.length

        linkText =
            String.slice token.index tokenEnd rawText

        destination =
            withProtocol linkText
    in
    if not (Regex.contains urlRegex destination) then
        Nothing

    else
        Just
            (Match
                { type_ = AutolinkType ( linkText, encodeUrl destination )
                , start = token.index
                , end = tokenEnd
                , textStart = 0
                , textEnd = 0
                , text = ""
                , matches = []
                }
            )


{-| Build an autolink `Match` for an email-autolink token.

The link text is the slice of `rawText` covered by the token; the destination is
that text prefixed with `mailto:`. Returns `Nothing` when the destination does
not satisfy `urlRegex`.

-}
emailAutolinkToMatch : String -> Token -> Maybe Match
emailAutolinkToMatch rawText token =
    let
        tokenEnd =
            token.index + token.length

        address =
            String.slice token.index tokenEnd rawText

        destination =
            "mailto:" ++ address
    in
    if not (Regex.contains urlRegex destination) then
        Nothing

    else
        Just
            (Match
                { type_ = AutolinkType ( address, encodeUrl destination )
                , start = token.index
                , end = tokenEnd
                , textStart = 0
                , textEnd = 0
                , text = ""
                , matches = []
                }
            )



-- From http://spec.commonmark.org/dingus/commonmark.js

Expand Down Expand Up @@ -1254,6 +1418,14 @@ htmlElementTTM remaining tokens matches references rawText =

Just ( closeToken, innerTokens, newTail ) ->
let
withoutInnerLinksIfAnchor =
case htmlModel of
HtmlParser.Element "a" _ _ ->
List.filter (isExtendedAutoLink >> not) innerTokens

_ ->
innerTokens

newMatch =
tokenPairToMatch
references
Expand All @@ -1262,7 +1434,7 @@ htmlElementTTM remaining tokens matches references rawText =
(HtmlType htmlModel)
token
closeToken
innerTokens
withoutInnerLinksIfAnchor
in
htmlElementTTM newTail tokens (newMatch :: matches) references rawText

Expand Down Expand Up @@ -1299,12 +1471,17 @@ voidHtmlTags =

isCloseToken : HtmlModel -> Token -> Bool
isCloseToken htmlModel token =
--case token.meaning of
-- HtmlToken False htmlModel_ ->
-- htmlModel.tag == htmlModel_.tag
--
-- _ ->
False
case token.meaning of
HtmlToken NotOpening htmlModel_ ->
case ( htmlModel, htmlModel_ ) of
( HtmlParser.Element firstTag _ _, HtmlParser.Element secondTag _ _ ) ->
firstTag == secondTag

_ ->
False

_ ->
False



Expand All @@ -1316,7 +1493,7 @@ linkImageTypeTTM : List Token -> List Token -> List Match -> References -> Strin
linkImageTypeTTM remaining tokens matches references rawText =
case remaining of
[] ->
emphasisTTM (List.reverse tokens) [] matches references rawText
extendedAutolinkTTM (List.reverse tokens) [] matches references rawText

token :: tokensTail ->
case token.meaning of
Expand Down Expand Up @@ -1607,13 +1784,22 @@ refRegexToMatch matchModel references maybeRegexMatch =
_ ->
LinkType (prepareUrlAndTitle rawUrl maybeTitle)
in
Just (
Match
Just
(Match
{ matchModel
| type_ = type_
, end = matchModel.end + regexMatchLength
}
)
)


{-| Ensure a URL carries an explicit scheme.

Returns `url` unchanged when it already starts with `http://` or `https://`;
otherwise prefixes it with `http://`.

The full scheme separator is checked rather than the bare letters `http`, so a
host name that merely begins with those letters still receives a scheme.

-}
withProtocol : String -> String
withProtocol url =
    if String.startsWith "http://" url || String.startsWith "https://" url then
        url

    else
        "http://" ++ url


encodeUrl : String -> String
Expand All @@ -1638,6 +1824,67 @@ decodeUrlRegex =



-- ExtendedAutolink Tokens To Matches


{-| `True` for tokens produced by either GFM autolink tokenizer, i.e. whose
meaning is `EmailAutolink` or `ExtendedAutolink`; `False` for every other token.
-}
isExtendedAutoLink : Token -> Bool
isExtendedAutoLink { meaning } =
    case meaning of
        EmailAutolink ->
            True

        ExtendedAutolink ->
            True

        _ ->
            False


{-| Tokens-to-matches pass for extended URL autolinks.

Walks `remaining`. An `ExtendedAutolink` token that `extendedAutolinkToMatch`
accepts is consumed and its match is prepended to `matches`; every other token
is pushed onto `tokens`. Once `remaining` is exhausted, the collected tokens are
reversed and handed to `emailAutolinkTTM`.

-}
extendedAutolinkTTM : List Token -> List Token -> List Match -> References -> String -> List Match
extendedAutolinkTTM remaining tokens matches references rawText =
    case remaining of
        [] ->
            emailAutolinkTTM (List.reverse tokens) [] matches references rawText

        token :: tokensTail ->
            let
                -- Only ExtendedAutolink tokens are ever converted.
                maybeMatch =
                    case token.meaning of
                        ExtendedAutolink ->
                            extendedAutolinkToMatch rawText token

                        _ ->
                            Nothing
            in
            case maybeMatch of
                Just match ->
                    extendedAutolinkTTM tokensTail tokens (match :: matches) references rawText

                Nothing ->
                    extendedAutolinkTTM tokensTail (token :: tokens) matches references rawText



-- EmailAutolink Tokens To Matches


{-| Tokens-to-matches pass for email autolinks.

Walks `remaining`. An `EmailAutolink` token that `emailAutolinkToMatch` accepts
is consumed and its match is prepended to `matches`; every other token is pushed
onto `tokens`. Once `remaining` is exhausted, the collected tokens are reversed
and handed to `emphasisTTM`.

-}
emailAutolinkTTM : List Token -> List Token -> List Match -> References -> String -> List Match
emailAutolinkTTM remaining tokens matches references rawText =
    case remaining of
        [] ->
            emphasisTTM (List.reverse tokens) [] matches references rawText

        token :: tokensTail ->
            let
                -- Only EmailAutolink tokens are ever converted.
                maybeMatch =
                    case token.meaning of
                        EmailAutolink ->
                            emailAutolinkToMatch rawText token

                        _ ->
                            Nothing
            in
            case maybeMatch of
                Just match ->
                    emailAutolinkTTM tokensTail tokens (match :: matches) references rawText

                Nothing ->
                    emailAutolinkTTM tokensTail (token :: tokens) matches references rawText



-- EmphasisType Tokens To Matches


Expand Down
Loading