mobilizon-event-importer/scrapers/jsonld/jsonld.go

131 lines
3.8 KiB
Go

package jsonld
import (
"encoding/json"
"fmt"
"io"
"regexp"
"time"
"git.nroo.de/norwin/mobilizon-event-importer/mobilizon"
"golang.org/x/net/html"
)
// ErrJsonLdNotFound is the error FindLdJsonInHtml returns when the HTML input
// yielded no application/ld+json script content.
var ErrJsonLdNotFound = fmt.Errorf("json+ld event description not found")
// EventAttendanceMode is the string type of an Event's "eventAttendanceMode"
// property. The constants below name the schema.org URLs used as its values.
type EventAttendanceMode string

const (
	// OfflineEventAttendanceMode is the schema.org OfflineEventAttendanceMode URL.
	OfflineEventAttendanceMode EventAttendanceMode = "https://schema.org/OfflineEventAttendanceMode"
	// OnlineEventAttendanceMode is the schema.org OnlineEventAttendanceMode URL.
	OnlineEventAttendanceMode EventAttendanceMode = "https://schema.org/OnlineEventAttendanceMode"
	// MixedEventAttendanceMode is the schema.org MixedEventAttendanceMode URL.
	MixedEventAttendanceMode EventAttendanceMode = "https://schema.org/MixedEventAttendanceMode"
)
// Event is the subset of a json+ld event description that this package
// decodes.
//
// FIXME: most json+ld values can be either a scalar or an array, depending on
// how many values are present. The fields below only model the scalar form, so
// a document using the array form for one of them does not decode cleanly.
// It is still open how to map this onto Go types.
type Event struct {
	// Name is read from the "name" property.
	Name string `json:"name"`
	// Description is read from the "description" property.
	Description string `json:"description"`
	// StartDate is read from the "startDate" property via LdTime's decoder.
	StartDate LdTime `json:"startDate"`
	// EndDate is read from the "endDate" property via LdTime's decoder.
	EndDate LdTime `json:"endDate"`
	// EventAttendanceMode is read from the "eventAttendanceMode" property.
	EventAttendanceMode EventAttendanceMode `json:"eventAttendanceMode"`
	// URL is read from the "url" property.
	URL string `json:"url"`
	// ImageURL is read from the "image" property, which must be a single string.
	ImageURL string `json:"image"`
	// Location is read from the "location" property, which must be a single object.
	Location Place `json:"location"`
	// Not decoded yet:
	// Performers []Perfomer `json:"performers"`
	// Offers []Offer
}
// Place is the decoded form of an Event's "location" object.
type Place struct {
	// Name is read from the "name" property.
	Name string `json:"name"`
	// Address is read from the "address" property, which must be an object.
	Address Address `json:"address"`
	// URL is read from the "url" property.
	URL string `json:"url"`
}
// Address is the decoded form of a Place's "address" object.
type Address struct {
	// Street is read from the "streetAddress" property.
	Street string `json:"streetAddress"`
	// PostCode is read from the "postalCode" property.
	PostCode string `json:"postalCode"`
	// City is read from the "addressLocality" property.
	City string `json:"addressLocality"`
	// Country is read from the "addressCountry" property, which must be a string.
	Country string `json:"addressCountry"`
}
// LdTime wraps time.Time so that json+ld timestamps can be decoded. A custom
// decoder is needed because the time format returned by facebook leaves out
// the colon in the UTC offset, which Go's RFC3339 layout does not accept:
//
//	facebook: 2022-04-04T22:00:00+0200
//	RFC 3339: 2022-04-04T22:00:00+02:00
type LdTime struct{ time.Time }

// UnmarshalJSON implements json.Unmarshaler.
//
// b must hold a JSON string containing an RFC 3339 timestamp; a UTC offset
// written without colon is accepted as well. A JSON null leaves t untouched
// and returns nil. Any other JSON value, malformed JSON, or a string that is
// not a valid timestamp yields an error and leaves t untouched.
func (t *LdTime) UnmarshalJSON(b []byte) error {
	// By convention, unmarshalers treat a JSON null as a no-op.
	if string(b) == "null" {
		return nil
	}

	// Decode through encoding/json instead of slicing off the first and last
	// byte: that rejects non-string values and cannot go out of bounds on
	// short input.
	var raw string
	if err := json.Unmarshal(b, &raw); err != nil {
		return fmt.Errorf("decoding json+ld time %s: %w", b, err)
	}

	// Insert the colon RFC 3339 requires between offset hours and minutes.
	normalized := timeszoneColonFilter.ReplaceAllString(raw, "$1:$2")
	parsed, err := time.Parse(time.RFC3339, normalized)
	if err != nil {
		return fmt.Errorf("parsing json+ld time %q: %w", raw, err)
	}
	t.Time = parsed
	return nil
}

// timeszoneColonFilter matches a line that ends in four consecutive digits and
// captures the final two digits separately from everything before them, so
// that a colon can be placed between the two groups.
var timeszoneColonFilter = regexp.MustCompile(`(?m)(.+\d\d)(\d\d)$`)
// ToMobilizonEvent builds a mobilizon.Event from the json+ld event: name and
// description become title and description, start and end date become the
// begin and end time, the location becomes the physical address and the image
// URL becomes the picture.
func (event Event) ToMobilizonEvent() *mobilizon.Event {
	// TODO: swap physical for online address depending on event.EventAttendanceMode
	converted := &mobilizon.Event{
		Title:       mobilizon.String(event.Name),
		Description: mobilizon.String(event.Description),
		BeginsOn:    event.StartDate.Time,
		EndsOn:      event.EndDate.Time,
	}

	venue := event.Location
	postal := venue.Address
	converted.PhysicalAddress = mobilizon.Address{
		Description: mobilizon.String(venue.Name),
		Street:      mobilizon.String(postal.Street),
		PostalCode:  mobilizon.String(postal.PostCode),
		Locality:    mobilizon.String(postal.City),
		Country:     mobilizon.String(postal.Country),
	}

	converted.Picture = mobilizon.Media{
		URL: mobilizon.String(event.ImageURL),
	}

	return converted
}
// ContentProcessor transforms the raw bytes of a json+ld script body and
// returns the bytes to use instead. FindLdJsonInHtml applies it, when non-nil,
// before JSON decoding.
type ContentProcessor func([]byte) []byte
// FindLdJsonInHtml scans htmlContent for the first
// <script type="application/ld+json"> element that has a body and decodes that
// body into an Event.
//
// If contentCallback is non-nil, the raw script body is passed through it and
// the callback's result is what gets decoded.
//
// It returns ErrJsonLdNotFound when the document ends without such a script,
// and any tokenizer error other than io.EOF unchanged. When JSON decoding
// fails, the decode error is returned together with the Event as far as it
// was populated.
func FindLdJsonInHtml(htmlContent io.Reader, contentCallback ContentProcessor) (*Event, error) {
	tokenizer := html.NewTokenizer(htmlContent)
	tokenizer.AllowCDATA(true)

	var jsonld []byte
	for jsonld == nil {
		tokenType := tokenizer.Next()
		if tokenType == html.StartTagToken && isLdJsonScriptTag(tokenizer) {
			// The script body, if present, is the token right after the start
			// tag. Keep the token type so that a read error hitting here is
			// reported below instead of being mistaken for "not found", and
			// so that an empty script does not end the search.
			if tokenType = tokenizer.Next(); tokenType == html.TextToken {
				jsonld = tokenizer.Text()
			}
		}
		if tokenType == html.ErrorToken {
			if err := tokenizer.Err(); err != io.EOF {
				return nil, err
			}
			break // regular end of document
		}
	}

	if jsonld == nil {
		return nil, ErrJsonLdNotFound
	}
	if contentCallback != nil {
		jsonld = contentCallback(jsonld)
	}

	ldEvent := &Event{}
	err := json.Unmarshal(jsonld, ldEvent)
	return ldEvent, err
}

// isLdJsonScriptTag reports whether the start tag the tokenizer is currently
// positioned on is a script tag whose type attribute is "application/ld+json".
func isLdJsonScriptTag(tokenizer *html.Tokenizer) bool {
	name, hasAttributes := tokenizer.TagName()
	if string(name) != "script" {
		return false
	}
	for hasAttributes {
		var key, val []byte
		key, val, hasAttributes = tokenizer.TagAttr()
		if string(key) == "type" && string(val) == "application/ld+json" {
			return true
		}
	}
	return false
}