Skip to content

Commit

Permalink
[feature] Use X-Robots-Tag headers to instruct scrapers/crawlers (#3737)
Browse files Browse the repository at this point in the history

* [feature] Use `X-Robots-Tag` headers to instruct scrapers/crawlers

* use switch for RobotsHeaders
  • Loading branch information
tsmethurst authored Feb 5, 2025
1 parent bfb81f5 commit baed591
Show file tree
Hide file tree
Showing 15 changed files with 311 additions and 142 deletions.
40 changes: 29 additions & 11 deletions cmd/gotosocial/action/server/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -417,7 +417,8 @@ var Start action.GTSAction = func(ctx context.Context) error {
return fmt.Errorf("error creating main router: %s", err)
}

// Start preparing middleware stack.
// Start preparing global middleware
// stack (used for every request).
middlewares := make([]gin.HandlerFunc, 1)

// RequestID middleware must run before tracing!
Expand Down Expand Up @@ -499,13 +500,14 @@ var Start action.GTSAction = func(ctx context.Context) error {
metricsModule = api.NewMetrics() // Metrics endpoints
healthModule = api.NewHealth(dbService.Ready) // Health check endpoints
fileserverModule = api.NewFileserver(process) // fileserver endpoints
robotsModule = api.NewRobots() // robots.txt endpoint
wellKnownModule = api.NewWellKnown(process) // .well-known endpoints
nodeInfoModule = api.NewNodeInfo(process) // nodeinfo endpoint
activityPubModule = api.NewActivityPub(dbService, process) // ActivityPub endpoints
webModule = web.New(dbService, process) // web pages + user profiles + settings panels etc
)

// create required middleware
// Create per-route / per-grouping middlewares.
// rate limiting
rlLimit := config.GetAdvancedRateLimitRequests()
clLimit := middleware.RateLimit(rlLimit, config.GetAdvancedRateLimitExceptionsParsed()) // client api
Expand All @@ -518,10 +520,25 @@ var Start action.GTSAction = func(ctx context.Context) error {
retryAfter := config.GetAdvancedThrottlingRetryAfter()
clThrottle := middleware.Throttle(cpuMultiplier, retryAfter) // client api
s2sThrottle := middleware.Throttle(cpuMultiplier, retryAfter)

// server-to-server (AP)
fsThrottle := middleware.Throttle(cpuMultiplier, retryAfter) // fileserver / web templates / emojis
pkThrottle := middleware.Throttle(cpuMultiplier, retryAfter) // throttle public key endpoint separately

// Robots http headers (x-robots-tag).
//
// robotsDisallowAll is used for client API + S2S endpoints
// that definitely should never be indexed by crawlers.
//
// robotsDisallowAIOnly is used for utility endpoints,
// fileserver, and for web endpoints that set their own
// additional robots directives in HTML meta tags.
//
// Other endpoints like .well-known and nodeinfo handle
// robots headers themselves based on configuration.
robotsDisallowAll := middleware.RobotsHeaders("")
robotsDisallowAIOnly := middleware.RobotsHeaders("aiOnly")

// Gzip middleware is applied to all endpoints except
// fileserver (compression too expensive for those),
// health (which really doesn't need compression), and
Expand All @@ -531,17 +548,18 @@ var Start action.GTSAction = func(ctx context.Context) error {

// these should be routed in order;
// apply throttling *after* rate limiting
authModule.Route(route, clLimit, clThrottle, gzip)
clientModule.Route(route, clLimit, clThrottle, gzip)
metricsModule.Route(route, clLimit, clThrottle)
healthModule.Route(route, clLimit, clThrottle)
fileserverModule.Route(route, fsMainLimit, fsThrottle)
fileserverModule.RouteEmojis(route, instanceAccount.ID, fsEmojiLimit, fsThrottle)
authModule.Route(route, clLimit, clThrottle, robotsDisallowAll, gzip)
clientModule.Route(route, clLimit, clThrottle, robotsDisallowAll, gzip)
metricsModule.Route(route, clLimit, clThrottle, robotsDisallowAIOnly)
healthModule.Route(route, clLimit, clThrottle, robotsDisallowAIOnly)
fileserverModule.Route(route, fsMainLimit, fsThrottle, robotsDisallowAIOnly)
fileserverModule.RouteEmojis(route, instanceAccount.ID, fsEmojiLimit, fsThrottle, robotsDisallowAIOnly)
robotsModule.Route(route, fsMainLimit, fsThrottle, robotsDisallowAIOnly, gzip)
wellKnownModule.Route(route, gzip, s2sLimit, s2sThrottle)
nodeInfoModule.Route(route, s2sLimit, s2sThrottle, gzip)
activityPubModule.Route(route, s2sLimit, s2sThrottle, gzip)
activityPubModule.RoutePublicKey(route, s2sLimit, pkThrottle, gzip)
webModule.Route(route, fsMainLimit, fsThrottle, gzip)
activityPubModule.Route(route, s2sLimit, s2sThrottle, robotsDisallowAll, gzip)
activityPubModule.RoutePublicKey(route, s2sLimit, pkThrottle, robotsDisallowAll, gzip)
webModule.Route(route, fsMainLimit, fsThrottle, robotsDisallowAIOnly, gzip)

// Finally start the main http server!
if err := route.Start(); err != nil {
Expand Down
2 changes: 2 additions & 0 deletions cmd/gotosocial/action/testrig/testrig.go
Original file line number Diff line number Diff line change
Expand Up @@ -284,6 +284,7 @@ var Start action.GTSAction = func(ctx context.Context) error {
metricsModule = api.NewMetrics() // Metrics endpoints
healthModule = api.NewHealth(state.DB.Ready) // Health check endpoints
fileserverModule = api.NewFileserver(processor) // fileserver endpoints
robotsModule = api.NewRobots() // robots.txt endpoint
wellKnownModule = api.NewWellKnown(processor) // .well-known endpoints
nodeInfoModule = api.NewNodeInfo(processor) // nodeinfo endpoint
activityPubModule = api.NewActivityPub(state.DB, processor) // ActivityPub endpoints
Expand All @@ -297,6 +298,7 @@ var Start action.GTSAction = func(ctx context.Context) error {
healthModule.Route(route)
fileserverModule.Route(route)
fileserverModule.RouteEmojis(route, instanceAccount.ID)
robotsModule.Route(route)
wellKnownModule.Route(route)
nodeInfoModule.Route(route)
activityPubModule.Route(route)
Expand Down
11 changes: 11 additions & 0 deletions internal/api/nodeinfo.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ package api
import (
"github.com/gin-gonic/gin"
"github.com/superseriousbusiness/gotosocial/internal/api/nodeinfo"
"github.com/superseriousbusiness/gotosocial/internal/config"
"github.com/superseriousbusiness/gotosocial/internal/middleware"
"github.com/superseriousbusiness/gotosocial/internal/processing"
"github.com/superseriousbusiness/gotosocial/internal/router"
Expand All @@ -43,6 +44,16 @@ func (w *NodeInfo) Route(r *router.Router, m ...gin.HandlerFunc) {
}),
)

// If instance is configured to serve instance stats
// faithfully at nodeinfo, we should allow robots to
// crawl nodeinfo endpoints in a limited capacity.
// In all other cases, disallow everything.
if config.GetInstanceStatsMode() == config.InstanceStatsModeServe {
nodeInfoGroup.Use(middleware.RobotsHeaders("allowSome"))
} else {
nodeInfoGroup.Use(middleware.RobotsHeaders(""))
}

w.nodeInfo.Route(nodeInfoGroup.Handle)
}

Expand Down
52 changes: 52 additions & 0 deletions internal/api/robots.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
// GoToSocial
// Copyright (C) GoToSocial Authors [email protected]
// SPDX-License-Identifier: AGPL-3.0-or-later
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.

package api

import (
"github.com/gin-gonic/gin"
"github.com/superseriousbusiness/gotosocial/internal/api/robots"
"github.com/superseriousbusiness/gotosocial/internal/middleware"
"github.com/superseriousbusiness/gotosocial/internal/router"
)

// Robots provides the robots.txt endpoint,
// wrapping the inner robots module so that
// shared middlewares can be attached to it.
type Robots struct {
	robots *robots.Module
}

// Route attaches the robots.txt handlers to the router,
// applying the given middlewares plus a cache-control
// middleware permitting 24hr caching of responses.
func (rb *Robots) Route(r *router.Router, m ...gin.HandlerFunc) {
	// Group the endpoint so middlewares
	// can be attached to it as a unit.
	group := r.AttachGroup("robots.txt")

	// Allow caching for 24 hrs.
	// https://www.rfc-editor.org/rfc/rfc9309.html#section-2.4
	cacheControl := middleware.CacheControl(middleware.CacheControlConfig{
		Directives: []string{"public", "max-age=86400"},
		Vary:       []string{"Accept-Encoding"},
	})

	// Caller-supplied middlewares run
	// first, then the cache-control one.
	group.Use(m...)
	group.Use(cacheControl)

	rb.robots.Route(group.Handle)
}

// NewRobots returns a new Robots module
// with its inner robots module initialized.
//
// NOTE(review): previously the robots field was left nil;
// that only worked because robots.Module is an empty struct
// whose methods never dereference their receiver. Initialize
// it explicitly so the type is safe against future fields.
func NewRobots() *Robots {
	return &Robots{
		robots: robots.New(),
	}
}
57 changes: 57 additions & 0 deletions internal/api/robots/robots.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
// GoToSocial
// Copyright (C) GoToSocial Authors [email protected]
// SPDX-License-Identifier: AGPL-3.0-or-later
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.

package robots

import (
"net/http"

"github.com/gin-gonic/gin"
apiutil "github.com/superseriousbusiness/gotosocial/internal/api/util"
"github.com/superseriousbusiness/gotosocial/internal/config"
)

// Module serves the robots.txt file itself.
type Module struct{}

// New returns a new robots.txt module.
func New() *Module {
	return &Module{}
}

// Route attaches the robots.txt GET handler via attachHandler.
//
// A different robots.txt file is served depending on instance
// stats mode: scraping of nodeinfo is left allowed if the admin
// has opted in to serving accurate stats there; in all other
// cases scraping of nodeinfo is disallowed.
func (m *Module) Route(attachHandler func(method string, path string, f ...gin.HandlerFunc) gin.IRoutes) {
	var handler gin.HandlerFunc
	switch config.GetInstanceStatsMode() {
	case config.InstanceStatsModeServe:
		// Stats served faithfully: plain robots.txt.
		handler = m.robotsGETHandler
	default:
		// Disallow scraping nodeinfo.
		handler = m.robotsGETHandlerDisallowNodeInfo
	}

	// Attach handler at empty path as this
	// is already grouped under /robots.txt.
	attachHandler(http.MethodGet, "", handler)
}

// robotsGETHandler serves the standard robots.txt
// file, which leaves nodeinfo endpoints crawlable.
func (m *Module) robotsGETHandler(c *gin.Context) {
	c.String(http.StatusOK, apiutil.RobotsTxt)
}

// robotsGETHandlerDisallowNodeInfo serves the robots.txt
// variant that additionally disallows nodeinfo endpoints.
func (m *Module) robotsGETHandlerDisallowNodeInfo(c *gin.Context) {
	c.String(http.StatusOK, apiutil.RobotsTxtDisallowNodeInfo)
}
44 changes: 10 additions & 34 deletions internal/web/robots.go → internal/api/util/robots.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,19 +15,17 @@
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.

package web

import (
"net/http"

"github.com/gin-gonic/gin"
"github.com/superseriousbusiness/gotosocial/internal/config"
)
package util

// See:
//
// - https://developers.google.com/search/docs/crawling-indexing/robots-meta-tag#robotsmeta
// - https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Robots-Tag
// - https://www.rfc-editor.org/rfc/rfc9309.html
const (
robotsPath = "/robots.txt"
robotsMetaAllowSome = "nofollow, noarchive, nositelinkssearchbox, max-image-preview:standard" // https://developers.google.com/search/docs/crawling-indexing/robots-meta-tag#robotsmeta
robotsTxt = `# GoToSocial robots.txt -- to edit, see internal/web/robots.go
RobotsDirectivesDisallow = "noindex, nofollow"
RobotsDirectivesAllowSome = "nofollow, noarchive, nositelinkssearchbox, max-image-preview:standard"
RobotsTxt = `# GoToSocial robots.txt -- to edit, see internal/api/util/robots.go
# More info @ https://developers.google.com/search/docs/crawling-indexing/robots/intro
# AI scrapers and the like.
Expand Down Expand Up @@ -127,31 +125,9 @@ Disallow: /about/suspended
# Webfinger endpoint.
Disallow: /.well-known/webfinger
`

robotsTxtNoNodeInfo = robotsTxt + `
RobotsTxtDisallowNodeInfo = RobotsTxt + `
# Disallow nodeinfo
Disallow: /.well-known/nodeinfo
Disallow: /nodeinfo/
`
)

// robotsGETHandler returns a decent robots.txt that prevents crawling
// the api, auth pages, settings pages, etc.
//
// More granular robots meta tags are then applied for web pages
// depending on user preferences (see internal/web).
func (m *Module) robotsGETHandler(c *gin.Context) {
// Allow caching for 24 hrs.
// https://www.rfc-editor.org/rfc/rfc9309.html#section-2.4
c.Header("Cache-Control", "public, max-age=86400")

if config.GetInstanceStatsMode() == config.InstanceStatsModeServe {
// Serve robots.txt as-is
// without forbidding nodeinfo.
c.String(http.StatusOK, robotsTxt)
return
}

// Disallow scraping nodeinfo.
c.String(http.StatusOK, robotsTxtNoNodeInfo)
}
4 changes: 3 additions & 1 deletion internal/api/wellknown/hostmeta/hostmeta.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
"net/http"

"github.com/gin-gonic/gin"
"github.com/superseriousbusiness/gotosocial/internal/middleware"
"github.com/superseriousbusiness/gotosocial/internal/processing"
)

Expand All @@ -40,5 +41,6 @@ func New(processor *processing.Processor) *Module {
}

// Route attaches the host-meta GET handler via attachHandler,
// injecting robots http header middleware to disallow all
// crawling/indexing of the endpoint.
//
// Fix: the pre-change registration line had been left in place
// alongside the new one, registering the same method+path twice;
// gin panics on duplicate route registration at startup.
func (m *Module) Route(attachHandler func(method string, path string, f ...gin.HandlerFunc) gin.IRoutes) {
	attachHandler(http.MethodGet, HostMetaPath, middleware.RobotsHeaders(""), m.HostMetaGETHandler)
}
58 changes: 57 additions & 1 deletion internal/api/wellknown/nodeinfo/nodeinfo.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,10 @@ import (
"net/http"

"github.com/gin-gonic/gin"
apiutil "github.com/superseriousbusiness/gotosocial/internal/api/util"
"github.com/superseriousbusiness/gotosocial/internal/config"
"github.com/superseriousbusiness/gotosocial/internal/gtserror"
"github.com/superseriousbusiness/gotosocial/internal/middleware"
"github.com/superseriousbusiness/gotosocial/internal/processing"
)

Expand All @@ -42,5 +46,57 @@ func New(processor *processing.Processor) *Module {
}

// Route attaches the .well-known/nodeinfo GET handler via
// attachHandler, injecting robots http header middleware.
//
// If the instance is configured to serve instance stats
// faithfully at nodeinfo, robots are allowed to crawl
// nodeinfo endpoints in a limited capacity. In all other
// cases, everything is disallowed.
//
// Fix: the pre-change registration line had been left in place
// alongside the new one, registering the same method+path twice;
// gin panics on duplicate route registration at startup. Also
// use a switch here for consistency with middleware.RobotsHeaders.
func (m *Module) Route(attachHandler func(method string, path string, f ...gin.HandlerFunc) gin.IRoutes) {
	var robots gin.HandlerFunc
	switch config.GetInstanceStatsMode() {
	case config.InstanceStatsModeServe:
		robots = middleware.RobotsHeaders("allowSome")
	default:
		robots = middleware.RobotsHeaders("")
	}

	// Attach handler, injecting robots http header middleware.
	attachHandler(http.MethodGet, NodeInfoWellKnownPath, robots, m.NodeInfoWellKnownGETHandler)
}

// NodeInfoWellKnownGETHandler swagger:operation GET /.well-known/nodeinfo nodeInfoWellKnownGet
//
// Returns a well-known response which redirects callers to `/nodeinfo/2.0`.
//
// eg. `{"links":[{"rel":"http://nodeinfo.diaspora.software/ns/schema/2.0","href":"http://example.org/nodeinfo/2.0"}]}`
// See: https://nodeinfo.diaspora.software/protocol.html
//
// ---
// tags:
// - .well-known
//
// produces:
// - application/json
//
// responses:
//	'200':
//		schema:
//			"$ref": "#/definitions/wellKnownResponse"
func (m *Module) NodeInfoWellKnownGETHandler(c *gin.Context) {
	// Bail early (406) if the client
	// doesn't accept a JSON response.
	if _, err := apiutil.NegotiateAccept(c, apiutil.JSONAcceptHeaders...); err != nil {
		apiutil.ErrorHandler(c, gtserror.NewErrorNotAcceptable(err, err.Error()), m.processor.InstanceGetV1)
		return
	}

	// Fetch the nodeinfo rel links from the processor.
	resp, errWithCode := m.processor.Fedi().NodeInfoRelGet(c.Request.Context())
	if errWithCode != nil {
		apiutil.ErrorHandler(c, errWithCode, m.processor.InstanceGetV1)
		return
	}

	// Encode JSON HTTP response.
	apiutil.EncodeJSONResponse(
		c.Writer,
		c.Request,
		http.StatusOK,
		apiutil.AppJSON,
		resp,
	)
}
Loading

0 comments on commit baed591

Please sign in to comment.