mirror of https://git.sr.ht/~rjarry/aerc
You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
130 lines
2.6 KiB
Go
130 lines
2.6 KiB
Go
package parse
|
|
|
|
import (
|
|
"bytes"
|
|
"io"
|
|
"regexp"
|
|
"sort"
|
|
)
|
|
|
|
// urlRe recognizes only the leading part of a link; everything after that
// prefix is consumed by hand-written scanning code rather than by the regexp.
//
// Two alternatives are accepted:
//   - a scheme of 2 to 8 lowercase letters followed by "://"
//     (capture group 1 holds the scheme name);
//   - the local part of an email address up to and including the "@",
//     optionally preceded by "mailto:" (capture group 2 holds that prefix).
var urlRe = regexp.MustCompile(
	`([a-z]{2,8})://|(mailto:)?[[:alnum:]_+.~/-]*[[:alnum:]]@`,
)
|
|
|
|
// HttpLinks searches a reader for a http link and returns a copy of the
|
|
// reader and a slice with links.
|
|
func HttpLinks(r io.Reader) (io.Reader, []string) {
|
|
buf, err := io.ReadAll(r)
|
|
if err != nil {
|
|
return r, nil
|
|
}
|
|
|
|
links := make(map[string]bool)
|
|
b := buf
|
|
match := urlRe.FindSubmatchIndex(b)
|
|
for ; match != nil; match = urlRe.FindSubmatchIndex(b) {
|
|
// Regular expressions do not really cut it here and we
|
|
// need to detect opening/closing braces to handle
|
|
// markdown link syntax.
|
|
var paren, bracket, ltgt, scheme int
|
|
var emitUrl bool
|
|
i, j := match[0], match[1]
|
|
b = b[i:]
|
|
scheme = j - i
|
|
j = scheme
|
|
|
|
// "inline" email without a mailto: prefix - add some extra checks for those
|
|
inlineEmail := len(match) > 4 && match[2] == -1 && match[4] == -1
|
|
|
|
for !emitUrl && j < len(b) && bytes.IndexByte(urichars, b[j]) != -1 {
|
|
switch b[j] {
|
|
case '[':
|
|
bracket++
|
|
j++
|
|
case '(':
|
|
paren++
|
|
j++
|
|
case '<':
|
|
ltgt++
|
|
j++
|
|
case ']':
|
|
bracket--
|
|
if bracket < 0 {
|
|
emitUrl = true
|
|
} else {
|
|
j++
|
|
}
|
|
case ')':
|
|
paren--
|
|
if paren < 0 {
|
|
emitUrl = true
|
|
} else {
|
|
j++
|
|
}
|
|
case '>':
|
|
ltgt--
|
|
if ltgt < 0 {
|
|
emitUrl = true
|
|
} else {
|
|
j++
|
|
}
|
|
case '&':
|
|
if inlineEmail {
|
|
emitUrl = true
|
|
} else {
|
|
j++
|
|
}
|
|
default:
|
|
j++
|
|
}
|
|
|
|
// we don't want those in inline emails
|
|
if inlineEmail && (paren > 0 || ltgt > 0 || bracket > 0) {
|
|
j--
|
|
emitUrl = true
|
|
}
|
|
}
|
|
|
|
// Heuristic to remove trailing characters that are
|
|
// valid URL characters, but typically not at the end of
|
|
// the URL
|
|
for trim := true; trim && j > 0; {
|
|
switch b[j-1] {
|
|
case '.', ',', ':', ';', '?', '!', '"', '\'', '%':
|
|
j--
|
|
default:
|
|
trim = false
|
|
}
|
|
}
|
|
if j == scheme {
|
|
// Only an URL scheme, ignore.
|
|
b = b[j:]
|
|
continue
|
|
}
|
|
url := string(b[:j])
|
|
if inlineEmail {
|
|
// Email address with missing mailto: scheme. Add it.
|
|
url = "mailto:" + url
|
|
}
|
|
links[url] = true
|
|
b = b[j:]
|
|
}
|
|
|
|
results := make([]string, 0, len(links))
|
|
for link := range links {
|
|
results = append(results, link)
|
|
}
|
|
sort.Strings(results)
|
|
|
|
return bytes.NewReader(buf), results
|
|
}
|
|
|
|
// urichars is the set of bytes that HttpLinks accepts while scanning the
// remainder of a link; the first byte outside this set ends the link.
var urichars = []byte(
	"abcdefghijklmnopqrstuvwxyz" + // lowercase letters
		"ABCDEFGHIJKLMNOPQRSTUVWXYZ" + // uppercase letters
		"0123456789" + // digits
		`-_.,~:;/?#@!$&%*+="'<>()[]`, // punctuation and bracket characters
)
|