From e6ec97c09d9902e4babd3a113e6bad30bbd1b396 Mon Sep 17 00:00:00 2001 From: vikingowl Date: Sat, 18 Apr 2026 12:12:13 +0200 Subject: [PATCH] feat(discovery/crawler): marktkalendarium.de parser --- backend/go.mod | 12 +- backend/go.sum | 89 ++++++++-- .../discovery/crawler/marktkalendarium.go | 157 ++++++++++++++++++ .../crawler/marktkalendarium_test.go | 64 +++++++ .../internal/domain/discovery/crawler/plz.go | 10 +- 5 files changed, 312 insertions(+), 20 deletions(-) create mode 100644 backend/internal/domain/discovery/crawler/marktkalendarium.go create mode 100644 backend/internal/domain/discovery/crawler/marktkalendarium_test.go diff --git a/backend/go.mod b/backend/go.mod index 11d6849..d8813ee 100644 --- a/backend/go.mod +++ b/backend/go.mod @@ -3,6 +3,7 @@ module marktvogt.de/backend go 1.26 require ( + github.com/PuerkitoBio/goquery v1.12.0 github.com/VikingOwl91/mistral-go-sdk v1.3.0 github.com/gin-gonic/gin v1.11.0 github.com/go-playground/validator/v10 v10.30.1 @@ -11,12 +12,13 @@ require ( github.com/jackc/pgx/v5 v5.8.0 github.com/pquerna/otp v1.5.0 github.com/valkey-io/valkey-go v1.0.72 - golang.org/x/crypto v0.48.0 + golang.org/x/crypto v0.49.0 golang.org/x/oauth2 v0.35.0 golang.org/x/time v0.14.0 ) require ( + github.com/andybalholm/cascadia v1.3.3 // indirect github.com/boombuler/barcode v1.0.1-0.20190219062509-6c824513bacc // indirect github.com/bytedance/sonic v1.14.0 // indirect github.com/bytedance/sonic/loader v0.3.0 // indirect @@ -42,9 +44,9 @@ require ( github.com/twitchyliquid64/golang-asm v0.15.1 // indirect github.com/ugorji/go/codec v1.3.0 // indirect golang.org/x/arch v0.20.0 // indirect - golang.org/x/net v0.49.0 // indirect - golang.org/x/sync v0.19.0 // indirect - golang.org/x/sys v0.41.0 // indirect - golang.org/x/text v0.34.0 // indirect + golang.org/x/net v0.52.0 // indirect + golang.org/x/sync v0.20.0 // indirect + golang.org/x/sys v0.42.0 // indirect + golang.org/x/text v0.35.0 // indirect google.golang.org/protobuf v1.36.9 // 
indirect ) diff --git a/backend/go.sum b/backend/go.sum index 782c9f8..c640361 100644 --- a/backend/go.sum +++ b/backend/go.sum @@ -1,5 +1,9 @@ +github.com/PuerkitoBio/goquery v1.12.0 h1:pAcL4g3WRXekcB9AU/y1mbKez2dbY2AajVhtkO8RIBo= +github.com/PuerkitoBio/goquery v1.12.0/go.mod h1:802ej+gV2y7bbIhOIoPY5sT183ZW0YFofScC4q/hIpQ= github.com/VikingOwl91/mistral-go-sdk v1.3.0 h1:OkTsodDE5lmdf7p2cwScqD2vIk8sScQ2IGk65dUjuz0= github.com/VikingOwl91/mistral-go-sdk v1.3.0/go.mod h1:f4emNtHUx2zSqY3V0LBz6lNI1jE6q/zh+SEU+/hJ0i4= +github.com/andybalholm/cascadia v1.3.3 h1:AG2YHrzJIm4BZ19iwJ/DAua6Btl3IwJX+VI4kktS1LM= +github.com/andybalholm/cascadia v1.3.3/go.mod h1:xNd9bqTn98Ln4DwST8/nG+H0yuB8Hmgu1YHNnWw0GeA= github.com/boombuler/barcode v1.0.1-0.20190219062509-6c824513bacc h1:biVzkmvwrH8WK8raXaxBx6fRVTlJILwEwQGL1I/ByEI= github.com/boombuler/barcode v1.0.1-0.20190219062509-6c824513bacc/go.mod h1:paBWMcWSl3LHKBqUq+rly7CNSldXjb2rDl3JlRe0mD8= github.com/bytedance/sonic v1.14.0 h1:/OfKt8HFw0kh2rj8N0F6C/qPGRESq0BbaNZgcNXXzQQ= @@ -31,6 +35,7 @@ github.com/goccy/go-yaml v1.18.0 h1:8W7wMFS12Pcas7KU+VVkaiCng+kG8QiFeFwzFb+rwuw= github.com/goccy/go-yaml v1.18.0/go.mod h1:XBurs7gK8ATbW4ZPGKgcbrY1Br56PdM69F7LkFRi1kA= github.com/golang-jwt/jwt/v5 v5.3.1 h1:kYf81DTWFe7t+1VvL7eS+jKFVWaUnK9cB1qbwn63YCY= github.com/golang-jwt/jwt/v5 v5.3.1/go.mod h1:fxCRLWMO43lRc8nhHWY6LGqRcf+1gQWArsqaEUEa5bE= +github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= @@ -84,27 +89,91 @@ github.com/ugorji/go/codec v1.3.0 h1:Qd2W2sQawAfG8XSvzwhBeoGq71zXOC/Q1E9y/wUcsUA github.com/ugorji/go/codec v1.3.0/go.mod h1:pRBVtBSKl77K30Bv8R2P+cLSGaTtex6fsA2Wjqmfxj4= github.com/valkey-io/valkey-go v1.0.72 
h1:iRWt1hJyOchcEgbHSkRY3aKkcBudxvMaVMsmxuYxuxE= github.com/valkey-io/valkey-go v1.0.72/go.mod h1:VGhZ6fs68Qrn2+OhH+6waZH27bjpgQOiLyUQyXuYK5k= +github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= go.uber.org/mock v0.5.2 h1:LbtPTcP8A5k9WPXj54PPPbjcI4Y6lhyOZXn+VS7wNko= go.uber.org/mock v0.5.2/go.mod h1:wLlUxC2vVTPTaE3UD51E0BGOAElKrILxhVSDYQLld5o= go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc= go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= golang.org/x/arch v0.20.0 h1:dx1zTU0MAE98U+TQ8BLl7XsJbgze2WnNKF/8tGp/Q6c= golang.org/x/arch v0.20.0/go.mod h1:bdwinDaKcfZUGpH09BB7ZmOfhalA8lQdzl62l8gGWsk= -golang.org/x/crypto v0.48.0 h1:/VRzVqiRSggnhY7gNRxPauEQ5Drw9haKdM0jqfcCFts= -golang.org/x/crypto v0.48.0/go.mod h1:r0kV5h3qnFPlQnBSrULhlsRfryS2pmewsg+XfMgkVos= -golang.org/x/net v0.49.0 h1:eeHFmOGUTtaaPSGNmjBKpbng9MulQsJURQUAfUwY++o= -golang.org/x/net v0.49.0/go.mod h1:/ysNB2EvaqvesRkuLAyjI1ycPZlQHM3q01F02UY/MV8= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= +golang.org/x/crypto v0.13.0/go.mod h1:y6Z2r+Rw4iayiXXAIxJIDAJ1zMW4yaTpebo8fPOliYc= +golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU= +golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8= +golang.org/x/crypto v0.31.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk= +golang.org/x/crypto v0.49.0 h1:+Ng2ULVvLHnJ/ZFEq4KdcDd/cfjrrjjNSXNzxg0Y4U4= +golang.org/x/crypto v0.49.0/go.mod h1:ErX4dUh2UM+CFYiXZRTcMpEcN8b/1gxEuv3nODoYtCA= +golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= +golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= +golang.org/x/mod v0.12.0/go.mod 
h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= +golang.org/x/mod v0.15.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= +golang.org/x/mod v0.17.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= +golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= +golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= +golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= +golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk= +golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44= +golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM= +golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4= +golang.org/x/net v0.52.0 h1:He/TN1l0e4mmR3QqHMT2Xab3Aj3L9qjbhRm78/6jrW0= +golang.org/x/net v0.52.0/go.mod h1:R1MAz7uMZxVMualyPXb+VaqGSa3LIaUqk0eEt3w36Sw= golang.org/x/oauth2 v0.35.0 h1:Mv2mzuHuZuY2+bkyWXIHMfhNdJAdwW3FuWeCPYN5GVQ= golang.org/x/oauth2 v0.35.0/go.mod h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA= -golang.org/x/sync v0.19.0 h1:vV+1eWNmZ5geRlYjzm2adRgW2/mcpevXNg50YZtPCE4= -golang.org/x/sync v0.19.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y= +golang.org/x/sync v0.6.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sync v0.7.0/go.mod 
h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sync v0.20.0 h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4= +golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.41.0 h1:Ivj+2Cp/ylzLiEU89QhWblYnOE9zerudt9Ftecq2C6k= -golang.org/x/sys v0.41.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= -golang.org/x/text v0.34.0 h1:oL/Qq0Kdaqxa1KbNeMKwQq0reLCCaFtqu2eNuSeNHbk= -golang.org/x/text v0.34.0/go.mod h1:homfLqTYRFyVYemLBFl5GgL/DWEiH5wcsQ5gSh1yziA= +golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo= +golang.org/x/sys v0.42.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= +golang.org/x/telemetry v0.0.0-20240228155512-f48c80bd79b2/go.mod h1:TeRTkGYfJXctD9OcfyVLyj2J3IxLnKwHJR8f4D8a3YE= +golang.org/x/term 
v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= +golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= +golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo= +golang.org/x/term v0.12.0/go.mod h1:owVbMEjm3cBLCHdkQu9b1opXd4ETQWc3BhuQGKgXgvU= +golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk= +golang.org/x/term v0.20.0/go.mod h1:8UkIAJTvZgivsXaD6/pH6U9ecQzZ45awqEOzuCvwpFY= +golang.org/x/term v0.27.0/go.mod h1:iMsnZpn0cago0GOrHO2+Y7u7JPn5AylBrcoWkElMTSM= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= +golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= +golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= +golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= +golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= +golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= +golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ= +golang.org/x/text v0.35.0 h1:JOVx6vVDFokkpaq1AEptVzLTpDe9KGpj5tR4/X+ybL8= +golang.org/x/text v0.35.0/go.mod h1:khi/HExzZJ2pGnjenulevKNX1W67CUy0AsXcNubPGCA= golang.org/x/time v0.14.0 h1:MRx4UaLrDotUKUdCIqzPC48t1Y9hANFKIRpNx+Te8PI= golang.org/x/time v0.14.0/go.mod h1:eL/Oa2bBBK0TkX57Fyni+NgnyQQN4LitPmob2Hjnqw4= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.1.12/go.mod 
h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= +golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= +golang.org/x/tools v0.13.0/go.mod h1:HvlwmtVNQAhOuCjW7xxvovg8wbNq7LwfXh/k7wXUl58= +golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= google.golang.org/protobuf v1.36.9 h1:w2gp2mA27hUeUzj9Ex9FBjsBm40zfaDtEWow293U7Iw= google.golang.org/protobuf v1.36.9/go.mod h1:fuxRtAxBytpl4zzqUh6/eyUujkJdNiuEkXntxiD/uRU= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= diff --git a/backend/internal/domain/discovery/crawler/marktkalendarium.go b/backend/internal/domain/discovery/crawler/marktkalendarium.go new file mode 100644 index 0000000..be8d05a --- /dev/null +++ b/backend/internal/domain/discovery/crawler/marktkalendarium.go @@ -0,0 +1,157 @@ +package crawler + +import ( + "bytes" + "context" + "fmt" + "strings" + "time" + + "github.com/PuerkitoBio/goquery" +) + +const ( + landDeutschland = "Deutschland" + landOesterreich = "Oesterreich" + landSchweiz = "Schweiz" +) + +// MarktkalendariumSource scrapes www.marktkalendarium.de. 
Table rows: +// Von | Bis | Veranstaltung | Ort | Platz | Webseite | Veranstalter +type MarktkalendariumSource struct { + fetcher *Fetcher + urls []string +} + +func NewMarktkalendarium(f *Fetcher, urls []string) *MarktkalendariumSource { + return &MarktkalendariumSource{fetcher: f, urls: urls} +} + +func (s *MarktkalendariumSource) Name() string { return "marktkalendarium" } + +func (s *MarktkalendariumSource) Fetch(ctx context.Context) ([]RawEvent, error) { + var all []RawEvent + for i, url := range s.urls { + if i > 0 { + if err := sleepCtx(ctx, 2*time.Second); err != nil { + return all, err + } + } + body, err := s.fetcher.Get(ctx, url, "") + if err != nil { + return all, fmt.Errorf("marktkalendarium %s: %w", url, err) + } + events, err := parseMarktkalendarium(body, url) + if err != nil { + return all, fmt.Errorf("marktkalendarium parse %s: %w", url, err) + } + all = append(all, events...) + } + return all, nil +} + +func parseMarktkalendarium(data []byte, sourceURL string) ([]RawEvent, error) { + doc, err := goquery.NewDocumentFromReader(bytes.NewReader(data)) + if err != nil { + return nil, err + } + var events []RawEvent + doc.Find("table tr").Each(func(_ int, tr *goquery.Selection) { + cells := tr.Find("td") + if cells.Length() < 4 { + return // header row or layout + } + von := strings.TrimSpace(cells.Eq(0).Text()) + bis := strings.TrimSpace(cells.Eq(1).Text()) + name := strings.TrimSpace(cells.Eq(2).Text()) + ort := strings.TrimSpace(cells.Eq(3).Text()) + platz := "" + website := "" + organizer := "" + if cells.Length() >= 5 { + platz = strings.TrimSpace(cells.Eq(4).Text()) + } + if cells.Length() >= 6 { + // First anchor href, not the text — text may be "[Facebook link]". 
+ href, _ := cells.Eq(5).Find("a").First().Attr("href") + website = strings.TrimSpace(href) + if website == "" { + website = strings.TrimSpace(cells.Eq(5).Text()) + } + } + if cells.Length() >= 7 { + organizer = strings.TrimSpace(cells.Eq(6).Text()) + } + + if name == "" || von == "" { + return + } + + start := parseDEDate(von) + end := parseDEDate(bis) + if start == nil { + return + } + + land, plz, city := splitMarktkalendariumOrt(ort) + + events = append(events, RawEvent{ + SourceName: "marktkalendarium", + SourceURL: sourceURL, + Name: name, + City: city, + PLZ: plz, + Land: land, + StartDate: start, + EndDate: end, + Website: website, + Venue: platz, + Organizer: organizer, + }) + }) + return events, nil +} + +// splitMarktkalendariumOrt parses "D-49186 Bad Iburg" into (land, PLZ, city). +// Returns ("", "", raw) when the prefix or PLZ doesn't match the expected shape. +func splitMarktkalendariumOrt(ort string) (land, plz, city string) { + idx := strings.Index(ort, "-") + if idx < 1 || idx > 3 { + return "", "", ort + } + prefix := ort[:idx] + rest := ort[idx+1:] + + switch prefix { + case "D": + land = landDeutschland + case "A": + land = landOesterreich + case "CH": + land = landSchweiz + default: + return "", "", ort + } + + sp := strings.IndexByte(rest, ' ') + if sp < 0 { + return land, "", strings.TrimSpace(rest) + } + plz = strings.TrimSpace(rest[:sp]) + city = strings.TrimSpace(rest[sp+1:]) + return land, plz, city +} + +// parseDEDate parses "3.4.2026" and returns nil on any failure. 
// parseDEDate parses a German day.month.year date such as "3.4.2026" or
// "03.04.2026". Surrounding whitespace is ignored. It returns nil when s is
// empty or is not a valid calendar date with a four-digit year.
//
// One layout is enough: the "2" (day) and "1" (month) reference elements
// accept both one- and two-digit values, so zero-padded input needs no
// separate "02.01.2006" layout.
func parseDEDate(s string) *time.Time {
	t, err := time.Parse("2.1.2006", strings.TrimSpace(s))
	if err != nil {
		return nil
	}
	return &t
}
+ e := events[0] + if e.SourceName != "marktkalendarium" { + t.Errorf("SourceName = %q; want marktkalendarium", e.SourceName) + } + if e.Name == "" { + t.Error("Name empty") + } + if e.City == "" { + t.Error("City empty") + } + if e.Land != "Deutschland" && e.Land != "Oesterreich" && e.Land != "Schweiz" { + t.Errorf("Land = %q; want DACH country", e.Land) + } + if e.StartDate == nil { + t.Error("StartDate nil") + } + if e.SourceURL != "https://www.marktkalendarium.de/maerkte2026.php" { + t.Errorf("SourceURL = %q", e.SourceURL) + } +} + +func TestParseMarktkalendariumOrtField(t *testing.T) { + tests := []struct { + in string + wantLand, wantPLZ, wantCity string + }{ + {"D-49186 Bad Iburg", "Deutschland", "49186", "Bad Iburg"}, + {"A-1010 Wien", "Oesterreich", "1010", "Wien"}, + {"CH-8001 Zuerich", "Schweiz", "8001", "Zuerich"}, + {"D-94152 Neuhaus am Inn", "Deutschland", "94152", "Neuhaus am Inn"}, + {"garbage", "", "", "garbage"}, + } + for _, tc := range tests { + t.Run(tc.in, func(t *testing.T) { + land, plz, city := splitMarktkalendariumOrt(tc.in) + if land != tc.wantLand || plz != tc.wantPLZ || city != tc.wantCity { + t.Errorf("splitMarktkalendariumOrt(%q) = (%q, %q, %q); want (%q, %q, %q)", + tc.in, land, plz, city, tc.wantLand, tc.wantPLZ, tc.wantCity) + } + }) + } +} diff --git a/backend/internal/domain/discovery/crawler/plz.go b/backend/internal/domain/discovery/crawler/plz.go index b84c975..4bcc786 100644 --- a/backend/internal/domain/discovery/crawler/plz.go +++ b/backend/internal/domain/discovery/crawler/plz.go @@ -17,7 +17,7 @@ func InferLand(plz string) string { if !isAllDigits(plz) { return "" } - return "Deutschland" + return landDeutschland case 4: if !isAllDigits(plz) { return "" @@ -25,13 +25,13 @@ func InferLand(plz string) string { switch plz[0] { case '1': if plz >= "1200" && plz <= "1299" { - return "Schweiz" + return landSchweiz } - return "Oesterreich" + return landOesterreich case '3', '4', '6', '8', '9': - return "Schweiz" + return 
landSchweiz default: - return "Oesterreich" + return landOesterreich } } return ""