author     James Mills <prologic@shortcircuit.net.au>  2018-04-03 14:32:43 -0700
committer  GitHub <noreply@github.com>  2018-04-03 14:32:43 -0700
commit     fe18a79dd2f7115b30d440a025a7f7f72486dd09 (patch)
tree       a2a5c9f24f90837013c6407077f687bf47de4934
parent     Removed build artifacts accidently committed to dist/ (diff)
download   gopherproxy-fe18a79dd2f7115b30d440a025a7f7f72486dd09.tar.gz
           gopherproxy-fe18a79dd2f7115b30d440a025a7f7f72486dd09.tar.bz2
           gopherproxy-fe18a79dd2f7115b30d440a025a7f7f72486dd09.zip

Add support for robots.txt user agent control from web crawlers (#10)

-rw-r--r--  cmd/gopherproxy/main.go  10
-rw-r--r--  gopherproxy.go           62
-rw-r--r--  robots.txt                2
3 files changed, 61 insertions(+), 13 deletions(-)
diff --git a/cmd/gopherproxy/main.go b/cmd/gopherproxy/main.go
index 9a7d1f9..84bde30 100644
--- a/cmd/gopherproxy/main.go
+++ b/cmd/gopherproxy/main.go
@@ -8,12 +8,16 @@ import (
 )
 
 var (
-	bind = flag.String("bind", "0.0.0.0:8000", "[int]:port to bind to")
-	uri  = flag.String("uri", "floodgap.com", "<host>:[port] to proxy to")
+	// TODO: Allow config file and environment vars
+	// (opt -> env -> config -> default)
+	bind       = flag.String("bind", "0.0.0.0:8000", "[int]:port to bind to")
+	robotsfile = flag.String("robots-file", "robots.txt", "robots.txt file")
+	uri        = flag.String("uri", "floodgap.com", "<host>:[port] to proxy to")
 )
 
 func main() {
 	flag.Parse()
 
-	log.Fatal(gopherproxy.ListenAndServe(*bind, *uri))
+	// Use a config struct
+	log.Fatal(gopherproxy.ListenAndServe(*bind, *robotsfile, *uri))
 }
diff --git a/gopherproxy.go b/gopherproxy.go
index dd416b2..42f67cf 100644
--- a/gopherproxy.go
+++ b/gopherproxy.go
@@ -10,6 +10,8 @@ import (
 	"net/url"
 	"strings"
 
+	"github.com/temoto/robotstxt"
+
 	"github.com/prologic/go-gopher"
 )
 
@@ -74,12 +76,21 @@ func renderDirectory(w http.ResponseWriter, tpl *template.Template, hostport str
 	}{title, out})
 }
 
-// Handler returns a Handler that proxies requests
+// GopherHandler returns a Handler that proxies requests
 // to the specified Gopher server as denoated by the first argument
 // to the request path and renders the content using the provided template.
-func Handler(tpl *template.Template, uri string) http.HandlerFunc {
+// The optional robots parameters points to a robotstxt.RobotsData struct
+// to test user agents against a configurable robotst.txt file.
+func GopherHandler(tpl *template.Template, robotsdata *robotstxt.RobotsData, uri string) http.HandlerFunc {
 	return func(w http.ResponseWriter, req *http.Request) {
-		parts := strings.Split(strings.TrimPrefix(req.URL.Path, "/"), "/")
+		agent := req.UserAgent()
+		path := strings.TrimPrefix(req.URL.Path, "/")
+
+		if robotsdata != nil && !robotsdata.TestAgent(path, agent) {
+			log.Printf("UserAgent %s ignored robots.txt", agent)
+		}
+
+		parts := strings.Split(path, "/")
 		hostport := parts[0]
 
 		if len(hostport) == 0 {
@@ -89,13 +100,11 @@ func Handler(tpl *template.Template, uri string) http.HandlerFunc {
 
 		var qs string
 
-		path := strings.Join(parts[1:], "/")
-
 		if req.URL.RawQuery != "" {
 			qs = fmt.Sprintf("?%s", url.QueryEscape(req.URL.RawQuery))
 		}
 
-		uri, err := url.QueryUnescape(path)
+		uri, err := url.QueryUnescape(strings.Join(parts[1:], "/"))
 		if err != nil {
 			io.WriteString(w, fmt.Sprintf("<b>Error:</b><pre>%s</pre>", err))
 			return
@@ -126,13 +135,44 @@ func Handler(tpl *template.Template, uri string) http.HandlerFunc {
 	}
 }
 
+// RobotsTxtHandler returns the contents of the robots.txt file
+// if configured and valid.
+func RobotsTxtHandler(robotstxtdata []byte) http.HandlerFunc {
+	return func(w http.ResponseWriter, req *http.Request) {
+		if robotstxtdata == nil {
+			http.Error(w, "Not Found", http.StatusNotFound)
+			return
+		}
+
+		w.Header().Set("Content-Type", "text/plain")
+		w.Write(robotstxtdata)
+	}
+}
+
 // ListenAndServe creates a listening HTTP server bound to
 // the interface specified by bind and sets up a Gopher to HTTP
 // proxy proxying requests as requested and by default will prozy
 // to a Gopher server address specified by uri if no servers is
-// specified by the request.
-func ListenAndServe(bind, uri string) error {
-	var tpl *template.Template
+// specified by the request. The robots argument is a pointer to
+// a robotstxt.RobotsData struct for testing user agents against
+// a configurable robots.txt file.
+func ListenAndServe(bind, robotsfile, uri string) error {
+	var (
+		tpl        *template.Template
+		robotsdata *robotstxt.RobotsData
+	)
+
+	robotstxtdata, err := ioutil.ReadFile(robotsfile)
+	if err != nil {
+		log.Printf("error reading robots.txt: %s", err)
+		robotstxtdata = nil
+	} else {
+		robotsdata, err = robotstxt.FromBytes(robotstxtdata)
+		if err != nil {
+			log.Printf("error reading robots.txt: %s", err)
+			robotstxtdata = nil
+		}
+	}
 
 	tpldata, err := ioutil.ReadFile(".template")
 	if err == nil {
@@ -144,6 +184,8 @@ func ListenAndServe(bind, uri string) error {
 		log.Fatal(err)
 	}
 
-	http.HandleFunc("/", Handler(tpl, uri))
+	http.HandleFunc("/", GopherHandler(tpl, robotsdata, uri))
+	http.HandleFunc("/robots.txt", RobotsTxtHandler(robotstxtdata))
+
 	return http.ListenAndServe(bind, nil)
 }
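
A quick way to exercise the new RobotsTxtHandler in isolation is a small test driven by net/http/httptest. This is a hypothetical sketch, not part of the commit; it assumes the package import path github.com/prologic/gopherproxy used by cmd/gopherproxy/main.go.

package gopherproxy_test

import (
	"net/http"
	"net/http/httptest"
	"testing"

	"github.com/prologic/gopherproxy" // assumed import path
)

func TestRobotsTxtHandler(t *testing.T) {
	// With no robots.txt data configured the handler answers 404.
	h := gopherproxy.RobotsTxtHandler(nil)
	rec := httptest.NewRecorder()
	h(rec, httptest.NewRequest("GET", "/robots.txt", nil))
	if rec.Code != http.StatusNotFound {
		t.Fatalf("expected 404, got %d", rec.Code)
	}

	// With data it serves the file verbatim as text/plain.
	body := "User-agent: *\nDisallow: /\n"
	h = gopherproxy.RobotsTxtHandler([]byte(body))
	rec = httptest.NewRecorder()
	h(rec, httptest.NewRequest("GET", "/robots.txt", nil))
	if rec.Body.String() != body {
		t.Fatalf("unexpected body: %q", rec.Body.String())
	}
}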
diff --git a/robots.txt b/robots.txt
new file mode 100644
index 0000000..1f53798
--- /dev/null
+++ b/robots.txt
@@ -0,0 +1,2 @@
+User-agent: *
+Disallow: /
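
The bundled robots.txt disallows every path for every user agent, so the robotsdata.TestAgent check added to GopherHandler reports all crawler requests as disallowed (the handler currently only logs this rather than rejecting the request). Below is a minimal sketch of that check using the same github.com/temoto/robotstxt calls as the diff; the sample path and agent string are illustrative only.

package main

import (
	"fmt"

	"github.com/temoto/robotstxt"
)

func main() {
	// Same contents as the robots.txt added by this commit.
	robotsdata, err := robotstxt.FromBytes([]byte("User-agent: *\nDisallow: /\n"))
	if err != nil {
		panic(err)
	}

	// TestAgent reports whether the given path is allowed for the given
	// user agent; GopherHandler logs when this comes back false.
	fmt.Println(robotsdata.TestAgent("/gopher.floodgap.com/1", "Googlebot")) // false
}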