From fe18a79dd2f7115b30d440a025a7f7f72486dd09 Mon Sep 17 00:00:00 2001
From: James Mills <prologic@shortcircuit.net.au>
Date: Tue, 3 Apr 2018 14:32:43 -0700
Subject: Add support for robots.txt user agent control from web crawlers (#10)

---
 cmd/gopherproxy/main.go | 10 +++++---
 gopherproxy.go          | 62 +++++++++++++++++++++++++++++++++++++++++--------
 robots.txt              |  2 ++
 3 files changed, 61 insertions(+), 13 deletions(-)
 create mode 100644 robots.txt

diff --git a/cmd/gopherproxy/main.go b/cmd/gopherproxy/main.go
index 9a7d1f9..84bde30 100644
--- a/cmd/gopherproxy/main.go
+++ b/cmd/gopherproxy/main.go
@@ -8,12 +8,16 @@ import (
 )
 
 var (
-	bind = flag.String("bind", "0.0.0.0:8000", "[int]:port to bind to")
-	uri  = flag.String("uri", "floodgap.com", "<host>:[port] to proxy to")
+	// TODO: Allow config file and environment vars
+	//       (opt -> env -> config -> default)
+	bind       = flag.String("bind", "0.0.0.0:8000", "[int]:port to bind to")
+	robotsfile = flag.String("robots-file", "robots.txt", "robots.txt file")
+	uri        = flag.String("uri", "floodgap.com", "<host>:[port] to proxy to")
 )
 
 func main() {
 	flag.Parse()
 
-	log.Fatal(gopherproxy.ListenAndServe(*bind, *uri))
+	// TODO: Use a config struct instead of positional arguments
+	log.Fatal(gopherproxy.ListenAndServe(*bind, *robotsfile, *uri))
 }
diff --git a/gopherproxy.go b/gopherproxy.go
index dd416b2..42f67cf 100644
--- a/gopherproxy.go
+++ b/gopherproxy.go
@@ -10,6 +10,8 @@ import (
 	"net/url"
 	"strings"
 
+	"github.com/temoto/robotstxt"
+
 	"github.com/prologic/go-gopher"
 )
 
@@ -74,12 +76,21 @@ func renderDirectory(w http.ResponseWriter, tpl *template.Template, hostport str
 	}{title, out})
 }
 
-// Handler returns a Handler that proxies requests
+// GopherHandler returns a Handler that proxies requests
 // to the specified Gopher server as denoated by the first argument
 // to the request path and renders the content using the provided template.
-func Handler(tpl *template.Template, uri string) http.HandlerFunc {
+// The optional robotsdata parameter (may be nil) points to a robotstxt.RobotsData
+// struct used to test user agents against a configurable robots.txt file.
+func GopherHandler(tpl *template.Template, robotsdata *robotstxt.RobotsData, uri string) http.HandlerFunc {
 	return func(w http.ResponseWriter, req *http.Request) {
-		parts := strings.Split(strings.TrimPrefix(req.URL.Path, "/"), "/")
+		agent := req.UserAgent()
+		path := strings.TrimPrefix(req.URL.Path, "/")
+
+		if robotsdata != nil && !robotsdata.TestAgent(path, agent) {
+			log.Printf("UserAgent %s ignored robots.txt", agent)
+		}
+
+		parts := strings.Split(path, "/")
 		hostport := parts[0]
 
 		if len(hostport) == 0 {
@@ -89,13 +100,11 @@ func Handler(tpl *template.Template, uri string) http.HandlerFunc {
 
 		var qs string
 
-		path := strings.Join(parts[1:], "/")
-
 		if req.URL.RawQuery != "" {
 			qs = fmt.Sprintf("?%s", url.QueryEscape(req.URL.RawQuery))
 		}
 
-		uri, err := url.QueryUnescape(path)
+		uri, err := url.QueryUnescape(strings.Join(parts[1:], "/"))
 		if err != nil {
 			io.WriteString(w, fmt.Sprintf("<b>Error:</b><pre>%s</pre>", err))
 			return
@@ -126,13 +135,44 @@ func Handler(tpl *template.Template, uri string) http.HandlerFunc {
 	}
 }
 
+// RobotsTxtHandler returns a Handler that serves the given robots.txt
+// contents as text/plain, or responds with 404 Not Found if they are nil.
+func RobotsTxtHandler(robotstxtdata []byte) http.HandlerFunc {
+	return func(w http.ResponseWriter, req *http.Request) {
+		if robotstxtdata == nil {
+			http.Error(w, "Not Found", http.StatusNotFound)
+			return
+		}
+
+		w.Header().Set("Content-Type", "text/plain")
+		w.Write(robotstxtdata)
+	}
+}
+
 // ListenAndServe creates a listening HTTP server bound to
 // the interface specified by bind and sets up a Gopher to HTTP
 // proxy proxying requests as requested and by default will prozy
 // to a Gopher server address specified by uri if no servers is
-// specified by the request.
-func ListenAndServe(bind, uri string) error {
-	var tpl *template.Template
+// specified by the request. The robotsfile argument is the path to
+// a robots.txt file that is served at /robots.txt and used for testing
+// the user agents of incoming requests.
+func ListenAndServe(bind, robotsfile, uri string) error {
+	var (
+		tpl        *template.Template
+		robotsdata *robotstxt.RobotsData
+	)
+
+	robotstxtdata, err := ioutil.ReadFile(robotsfile)
+	if err != nil {
+		log.Printf("error reading robots.txt: %s", err)
+		robotstxtdata = nil
+	} else {
+		robotsdata, err = robotstxt.FromBytes(robotstxtdata)
+		if err != nil {
+			log.Printf("error reading robots.txt: %s", err)
+			robotstxtdata = nil
+		}
+	}
 
 	tpldata, err := ioutil.ReadFile(".template")
 	if err == nil {
@@ -144,6 +184,8 @@ func ListenAndServe(bind, uri string) error {
 		log.Fatal(err)
 	}
 
-	http.HandleFunc("/", Handler(tpl, uri))
+	http.HandleFunc("/", GopherHandler(tpl, robotsdata, uri))
+	http.HandleFunc("/robots.txt", RobotsTxtHandler(robotstxtdata))
+
 	return http.ListenAndServe(bind, nil)
 }
diff --git a/robots.txt b/robots.txt
new file mode 100644
index 0000000..1f53798
--- /dev/null
+++ b/robots.txt
@@ -0,0 +1,2 @@
+User-agent: *
+Disallow: /
-- 
cgit v1.2.3-70-g09d2