/*
The CartoCSS scanner is based on the github.com/gorilla/css CSS scanner.

Copyright (c) 2015, Omniscale
Copyright (c) 2013, Gorilla web toolkit
All rights reserved.

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.

Redistributions in binary form must reproduce the above copyright notice, this
list of conditions and the following disclaimer in the documentation and/or
other materials provided with the distribution.

Neither the name of the {organization} nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

package mss

import (
	"fmt"
	"regexp"
	"strings"
	"unicode"
	"unicode/utf8"
)

// tokenType identifies the type of lexical tokens.
type tokenType int

// String returns a string representation of the token type.
func (t tokenType) String() string {
	return tokenNames[t]
}

// token represents a token and the corresponding string.
type token struct {
	t      tokenType
	value  string
	line   int
	column int
}

// String returns a string representation of the token.
func (t *token) String() string {
	if len(t.value) > 10 {
		return fmt.Sprintf("%s (line: %d, column: %d): %.10q...",
			t.t, t.line, t.column, t.value)
	}
	return fmt.Sprintf("%s (line: %d, column: %d): %q",
		t.t, t.line, t.column, t.value)
}
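
// A hypothetical illustration of the truncation above: %.10q limits the
// formatted input to ten characters, so a token holding the value
// "polygon-fill" would print as
//
//	IDENT (line: 1, column: 1): "polygon-fi"...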

// The complete list of tokens in MSS.
const (
	// Scanner flags
	tokenError tokenType = iota
	tokenEOF
	// regular tokens
	tokenIdent
	tokenAtKeyword
	tokenString
	tokenHash
	tokenAttachment
	tokenClass
	tokenInstance
	tokenLBrace
	tokenRBrace
	tokenLBracket
	tokenRBracket
	tokenLParen
	tokenRParen
	tokenColon
	tokenSemicolon
	tokenComma
	tokenPlus
	tokenMinus
	tokenMultiply
	tokenDivide
	tokenModulo
	tokenComp
	tokenNumber
	tokenPercentage
	tokenDimension
	tokenURI
	tokenUnicodeRange
	tokenS
	tokenComment
	tokenFunction
	tokenIncludes
	tokenDashMatch
	tokenPrefixMatch
	tokenSuffixMatch
	tokenSubstringMatch
	tokenChar
	tokenBOM
)

// tokenNames maps tokenTypes to their names. Used for conversion to string.
var tokenNames = map[tokenType]string{
	tokenError:          "error",
	tokenEOF:            "EOF",
	tokenIdent:          "IDENT",
	tokenAtKeyword:      "ATKEYWORD",
	tokenString:         "STRING",
	tokenHash:           "HASH",
	tokenAttachment:     "ATTACHMENT",
	tokenClass:          "CLASS",
	tokenInstance:       "INSTANCE",
	tokenLBrace:         "LBRACE",
	tokenRBrace:         "RBRACE",
	tokenLBracket:       "LBRACKET",
	tokenRBracket:       "RBRACKET",
	tokenLParen:         "LPAREN",
	tokenRParen:         "RPAREN",
	tokenColon:          "COLON",
	tokenSemicolon:      "SEMICOLON",
	tokenComma:          "COMMA",
	tokenPlus:           "PLUS",
	tokenMinus:          "MINUS",
	tokenMultiply:       "MULTIPLY",
	tokenDivide:         "DIVIDE",
	tokenModulo:         "MODULO",
	tokenComp:           "COMP",
	tokenNumber:         "NUMBER",
	tokenPercentage:     "PERCENTAGE",
	tokenDimension:      "DIMENSION",
	tokenURI:            "URI",
	tokenUnicodeRange:   "UNICODE-RANGE",
	tokenS:              "S",
	tokenComment:        "COMMENT",
	tokenFunction:       "FUNCTION",
	tokenIncludes:       "INCLUDES",
	tokenDashMatch:      "DASHMATCH",
	tokenPrefixMatch:    "PREFIXMATCH",
	tokenSuffixMatch:    "SUFFIXMATCH",
	tokenSubstringMatch: "SUBSTRINGMATCH",
	tokenChar:           "CHAR",
	tokenBOM:            "BOM",
}

// Macros and productions -----------------------------------------------------
// http://www.w3.org/TR/css3-syntax/#tokenization

var macroRegexp = regexp.MustCompile(`\{[a-z]+\}`)

// macros maps macro names to patterns to be expanded.
var macros = map[string]string{
	// must be escaped: `\.+*?()|[]{}^$`
	"ident":      `-?{nmstart}{nmchar}*`,
	"name":       `{nmchar}+`,
	"nmstart":    `[a-zA-Z_]|{nonascii}|{escape}`,
	"nonascii":   "[\u0080-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]",
	"unicode":    `\\[0-9a-fA-F]{1,6}{wc}?`,
	"escape":     "{unicode}|\\\\[\u0020-\u007E\u0080-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]",
	"nmchar":     `[a-zA-Z0-9_-]|{nonascii}|{escape}`,
	"num":        `-?[0-9]*\.?[0-9]+`,
	"string":     `"(?:{stringchar}|')*"|'(?:{stringchar}|")*'`,
	"stringchar": `{urlchar}|[ ]|\\{nl}`,
	"urlchar":    "[\u0009\u0021\u0023-\u0026\u0028-\u007E]|{nonascii}|{escape}",
	"nl":         `[\n\r\f]|\r\n`,
	"w":          `{wc}*`,
	"wc":         `[\t\n\f\r ]`,
}

// productions maps the list of tokens to patterns to be expanded.
var productions = map[tokenType]string{
	// Unused regexps (matched using other methods) are commented out.
	tokenIdent:        `{ident}`,
	tokenAtKeyword:    `@{ident}`,
	tokenString:       `{string}`,
	tokenHash:         `#{name}`,
	tokenAttachment:   `::{name}`,
	tokenClass:        `\.{name}`,
	tokenInstance:     `{ident}/`,
	tokenNumber:       `{num}`,
	tokenPercentage:   `{num}%`,
	tokenDimension:    `{num}{ident}`,
	tokenURI:          `url\({w}(?:{string}|{urlchar}*){w}\)`,
	tokenUnicodeRange: `U\+[0-9A-F\?]{1,6}(?:-[0-9A-F]{1,6})?`,
	tokenS:            `{wc}+`,
	tokenComment:      `/\*[^\*]*[\*]+(?:[^/][^\*]*[\*]+)*/`,
	tokenFunction:     `{ident}\(`,
	tokenComp:         `>=|<=|>|<|!=|=~|=`,
	//tokenIncludes:       `~=`,
	//tokenDashMatch:      `\|=`,
	//tokenPrefixMatch:    `\^=`,
	//tokenSuffixMatch:    `\$=`,
	//tokenSubstringMatch: `\*=`,
	//tokenChar:           `[^"']`,
	//tokenBOM:            "\uFEFF",
}

// matchers maps the list of tokens to compiled regular expressions.
//
// The map is filled on init() using the macros and productions defined in
// the CSS specification.
var matchers = map[tokenType]*regexp.Regexp{}

// matchOrder is the order to test regexps when first-char shortcuts
// can't be used. More specific patterns come first, so that a shorter
// prefix match cannot win: e.g. tokenURI before tokenFunction (both
// match the `url(` of a URI) and tokenDimension before tokenNumber
// (so `5px` is not cut short at `5`).
var matchOrder = []tokenType{
	tokenURI,
	tokenFunction,
	tokenUnicodeRange,
	tokenInstance,
	tokenIdent,
	tokenDimension,
	tokenPercentage,
	tokenNumber,
	tokenComp,
}

func init() {
	// replace macros and compile regexps for productions.
	replaceMacro := func(s string) string {
		return "(?:" + macros[s[1:len(s)-1]] + ")"
	}
	for t, s := range productions {
		for macroRegexp.MatchString(s) {
			s = macroRegexp.ReplaceAllStringFunc(s, replaceMacro)
		}
		matchers[t] = regexp.MustCompile("^(?:" + s + ")")
	}
}
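
// As an illustration of the expansion above (derived from the tables, not
// part of the original source): the tokenPercentage production `{num}%`
// expands to `(?:-?[0-9]*\.?[0-9]+)%` after macro replacement and is
// compiled as the anchored regexp `^(?:(?:-?[0-9]*\.?[0-9]+)%)`.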

// Scanner --------------------------------------------------------------------

// newScanner returns a new CSS scanner for the given input.
func newScanner(input string) *scanner {
	// Normalize newlines.
	input = strings.Replace(input, "\r\n", "\n", -1)
	return &scanner{
		input: input,
		row:   1,
		col:   1,
	}
}
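
// A minimal usage sketch (hypothetical input, not part of the original
// source; newScanner and the token fields as defined in this file):
//
//	s := newScanner(".landuse { line-width: 0.5; }")
//	for tok := s.Next(); tok.t != tokenEOF && tok.t != tokenError; tok = s.Next() {
//		fmt.Println(tok) // first token: CLASS (line: 1, column: 1): ".landuse"
//	}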

// scanner scans an input and emits tokens following the CSS3 specification.
type scanner struct {
	input string
	pos   int
	row   int
	col   int
	err   *token
}

// Next returns the next token from the input.
//
// At the end of the input the token type is tokenEOF.
//
// If the input can't be tokenized the token type is tokenError. This occurs
// in case of unclosed quotation marks or comments.
func (s *scanner) Next() *token {
	if s.err != nil {
		return s.err
	}
	if s.pos >= len(s.input) {
		s.err = &token{tokenEOF, "", s.row, s.col}
		return s.err
	}
	if s.pos == 0 {
		// Test BOM only once, at the beginning of the file.
		if strings.HasPrefix(s.input, "\uFEFF") {
			return s.emitSimple(tokenBOM, "\uFEFF")
		}
	}
	// There's a lot we can guess based on the first byte so we'll take a
	// shortcut before testing multiple regexps.
	input := s.input[s.pos:]
	switch input[0] {
	case '\t', '\n', '\f', '\r', ' ':
		// Whitespace.
		return s.emitToken(tokenS, matchers[tokenS].FindString(input))
	case '.':
		// Dot is too common to not have a quick check.
		// We test whether this is a Class or a Char; if the dot is followed
		// by a digit it starts a number/percentage/dimension and is matched
		// by the regexps below.
		if len(input) > 1 && !unicode.IsDigit(rune(input[1])) {
			if match := matchers[tokenClass].FindString(input); match != "" {
				return s.emitSimple(tokenClass, match)
			}
			return s.emitSimple(tokenChar, ".")
		}
	case '#':
		// Another common one: Hash or Char.
		if match := matchers[tokenHash].FindString(input); match != "" {
			return s.emitSimple(tokenHash, match)
		}
		return s.emitSimple(tokenChar, "#")
	case '@':
		// Another common one: AtKeyword or Char.
		if match := matchers[tokenAtKeyword].FindString(input); match != "" {
			return s.emitSimple(tokenAtKeyword, match)
		}
		return s.emitSimple(tokenChar, "@")
	case ':':
		// Another common one: Attachment or Colon.
		if match := matchers[tokenAttachment].FindString(input); match != "" {
			return s.emitSimple(tokenAttachment, match)
		}
		return s.emitSimple(tokenColon, ":")
	case '%':
		return s.emitSimple(tokenModulo, string(input[0]))
	// More common chars.
	case '&':
		return s.emitSimple(tokenChar, string(input[0]))
	case ',':
		return s.emitSimple(tokenComma, string(input[0]))
	case ';':
		return s.emitSimple(tokenSemicolon, string(input[0]))
	case '(':
		return s.emitSimple(tokenLParen, string(input[0]))
	case ')':
		return s.emitSimple(tokenRParen, string(input[0]))
	case '[':
		return s.emitSimple(tokenLBracket, string(input[0]))
	case ']':
		return s.emitSimple(tokenRBracket, string(input[0]))
	case '{':
		return s.emitSimple(tokenLBrace, string(input[0]))
	case '}':
		return s.emitSimple(tokenRBrace, string(input[0]))
	case '+':
		return s.emitSimple(tokenPlus, string(input[0]))
	case '-':
		// A '-' can start a negative number or a function name (idents may
		// begin with '-'); otherwise it is emitted as the minus operator.
		if match := matchers[tokenNumber].FindString(input); match != "" {
			return s.emitSimple(tokenNumber, match)
		}
		if match := matchers[tokenFunction].FindString(input); match != "" {
			return s.emitSimple(tokenFunction, match)
		}

		return s.emitSimple(tokenMinus, string(input[0]))
	case '*':
		return s.emitSimple(tokenMultiply, string(input[0]))
	// case '/': handled below
case '"', '\'':
|
|
// String or error.
|
|
match := matchers[tokenString].FindString(input)
|
|
if match != "" {
|
|
return s.emitToken(tokenString, match)
|
|
} else {
|
|
s.err = &token{tokenError, "unclosed quotation mark", s.row, s.col}
|
|
return s.err
|
|
}
|
|
case '/':
|
|
// Comment, error or Char.
|
|
if len(input) > 1 && input[1] == '*' {
|
|
match := matchers[tokenComment].FindString(input)
|
|
if match != "" {
|
|
return s.emitToken(tokenComment, match)
|
|
} else {
|
|
s.err = &token{tokenError, "unclosed comment", s.row, s.col}
|
|
return s.err
|
|
}
|
|
} else if len(input) > 1 && input[1] == '/' {
|
|
idx := strings.Index(input, "\n")
|
|
if idx < 0 {
|
|
// comment at end of document wihout new line
|
|
idx = len(input)
|
|
}
|
|
return s.emitToken(tokenComment, input[:idx])
|
|
}
|
|
return s.emitSimple(tokenDivide, "/")
|
|
}
|
|
	// Test all regexps, in order.
	for _, token := range matchOrder {
		if match := matchers[token].FindString(input); match != "" {
			return s.emitToken(token, match)
		}
	}
	// We already handled unclosed quotation marks and comments,
	// so this can only be a Char.
	r, width := utf8.DecodeRuneInString(input)
	token := &token{tokenChar, string(r), s.row, s.col}
	s.col += width
	s.pos += width
	return token
}

// updatePosition updates input coordinates based on the consumed text.
func (s *scanner) updatePosition(text string) {
	width := utf8.RuneCountInString(text)
	lines := strings.Count(text, "\n")
	s.row += lines
	if lines == 0 {
		s.col += width
	} else {
		// Counting runes from the last "\n" (inclusive) yields the 1-based
		// column of the rune that follows the consumed text.
		s.col = utf8.RuneCountInString(text[strings.LastIndex(text, "\n"):])
	}
	s.pos += len(text)
}

// emitToken returns a token for the string v and updates the scanner position.
func (s *scanner) emitToken(t tokenType, v string) *token {
	token := &token{t, v, s.row, s.col}
	s.updatePosition(v)
	return token
}

// emitSimple returns a token for the string v and updates the scanner
// position in a simplified manner.
//
// The string is known to have only ASCII characters and to not have a newline.
func (s *scanner) emitSimple(t tokenType, v string) *token {
	token := &token{t, v, s.row, s.col}
	s.col += len(v)
	s.pos += len(v)
	return token
}