diff options
author | James Mills <prologic@shortcircuit.net.au> | 2018-04-03 14:32:43 -0700 |
---|---|---|
committer | GitHub <noreply@github.com> | 2018-04-03 14:32:43 -0700 |
commit | fe18a79dd2f7115b30d440a025a7f7f72486dd09 (patch) | |
tree | a2a5c9f24f90837013c6407077f687bf47de4934 | |
parent | Removed build artifacts accidently committed to dist/ (diff) | |
download | gopherproxy-fe18a79dd2f7115b30d440a025a7f7f72486dd09.tar.gz gopherproxy-fe18a79dd2f7115b30d440a025a7f7f72486dd09.tar.bz2 gopherproxy-fe18a79dd2f7115b30d440a025a7f7f72486dd09.zip |
Add support for robots.txt user agent control from web crawlers (#10)
-rw-r--r-- | cmd/gopherproxy/main.go | 10 | ||||
-rw-r--r-- | gopherproxy.go | 62 | ||||
-rw-r--r-- | robots.txt | 2 |
3 files changed, 61 insertions, 13 deletions
diff --git a/cmd/gopherproxy/main.go b/cmd/gopherproxy/main.go index 9a7d1f9..84bde30 100644 --- a/cmd/gopherproxy/main.go +++ b/cmd/gopherproxy/main.go | |||
@@ -8,12 +8,16 @@ import ( | |||
8 | ) | 8 | ) |
9 | 9 | ||
10 | var ( | 10 | var ( |
11 | bind = flag.String("bind", "0.0.0.0:8000", "[int]:port to bind to") | 11 | // TODO: Allow config file and environment vars |
12 | uri = flag.String("uri", "floodgap.com", "<host>:[port] to proxy to") | 12 | // (opt -> env -> config -> default) |
13 | bind = flag.String("bind", "0.0.0.0:8000", "[int]:port to bind to") | ||
14 | robotsfile = flag.String("robots-file", "robots.txt", "robots.txt file") | ||
15 | uri = flag.String("uri", "floodgap.com", "<host>:[port] to proxy to") | ||
13 | ) | 16 | ) |
14 | 17 | ||
15 | func main() { | 18 | func main() { |
16 | flag.Parse() | 19 | flag.Parse() |
17 | 20 | ||
18 | log.Fatal(gopherproxy.ListenAndServe(*bind, *uri)) | 21 | // Use a config struct |
22 | log.Fatal(gopherproxy.ListenAndServe(*bind, *robotsfile, *uri)) | ||
19 | } | 23 | } |
diff --git a/gopherproxy.go b/gopherproxy.go index dd416b2..42f67cf 100644 --- a/gopherproxy.go +++ b/gopherproxy.go | |||
@@ -10,6 +10,8 @@ import ( | |||
10 | "net/url" | 10 | "net/url" |
11 | "strings" | 11 | "strings" |
12 | 12 | ||
13 | "github.com/temoto/robotstxt" | ||
14 | |||
13 | "github.com/prologic/go-gopher" | 15 | "github.com/prologic/go-gopher" |
14 | ) | 16 | ) |
15 | 17 | ||
@@ -74,12 +76,21 @@ func renderDirectory(w http.ResponseWriter, tpl *template.Template, hostport str | |||
74 | }{title, out}) | 76 | }{title, out}) |
75 | } | 77 | } |
76 | 78 | ||
77 | // Handler returns a Handler that proxies requests | 79 | // GopherHandler returns a Handler that proxies requests |
78 | // to the specified Gopher server as denoated by the first argument | 80 | // to the specified Gopher server as denoated by the first argument |
78 | // to the specified Gopher server as denoted by the first argument | 80 | // to the specified Gopher server as denoted by the first argument |
80 | func Handler(tpl *template.Template, uri string) http.HandlerFunc { | 82 | // The optional robotsdata parameter points to a robotstxt.RobotsData struct |
83 | // to test user agents against a configurable robots.txt file. | ||
84 | func GopherHandler(tpl *template.Template, robotsdata *robotstxt.RobotsData, uri string) http.HandlerFunc { | ||
81 | return func(w http.ResponseWriter, req *http.Request) { | 85 | return func(w http.ResponseWriter, req *http.Request) { |
82 | parts := strings.Split(strings.TrimPrefix(req.URL.Path, "/"), "/") | 86 | agent := req.UserAgent() |
87 | path := strings.TrimPrefix(req.URL.Path, "/") | ||
88 | |||
89 | if robotsdata != nil && !robotsdata.TestAgent(path, agent) { | ||
90 | log.Printf("UserAgent %s ignored robots.txt", agent) | ||
91 | } | ||
92 | |||
93 | parts := strings.Split(path, "/") | ||
83 | hostport := parts[0] | 94 | hostport := parts[0] |
84 | 95 | ||
85 | if len(hostport) == 0 { | 96 | if len(hostport) == 0 { |
@@ -89,13 +100,11 @@ func Handler(tpl *template.Template, uri string) http.HandlerFunc { | |||
89 | 100 | ||
90 | var qs string | 101 | var qs string |
91 | 102 | ||
92 | path := strings.Join(parts[1:], "/") | ||
93 | |||
94 | if req.URL.RawQuery != "" { | 103 | if req.URL.RawQuery != "" { |
95 | qs = fmt.Sprintf("?%s", url.QueryEscape(req.URL.RawQuery)) | 104 | qs = fmt.Sprintf("?%s", url.QueryEscape(req.URL.RawQuery)) |
96 | } | 105 | } |
97 | 106 | ||
98 | uri, err := url.QueryUnescape(path) | 107 | uri, err := url.QueryUnescape(strings.Join(parts[1:], "/")) |
99 | if err != nil { | 108 | if err != nil { |
100 | io.WriteString(w, fmt.Sprintf("<b>Error:</b><pre>%s</pre>", err)) | 109 | io.WriteString(w, fmt.Sprintf("<b>Error:</b><pre>%s</pre>", err)) |
101 | return | 110 | return |
@@ -126,13 +135,44 @@ func Handler(tpl *template.Template, uri string) http.HandlerFunc { | |||
126 | } | 135 | } |
127 | } | 136 | } |
128 | 137 | ||
138 | // RobotsTxtHandler returns the contents of the robots.txt file | ||
139 | // if configured and valid. | ||
140 | func RobotsTxtHandler(robotstxtdata []byte) http.HandlerFunc { | ||
141 | return func(w http.ResponseWriter, req *http.Request) { | ||
142 | if robotstxtdata == nil { | ||
143 | http.Error(w, "Not Found", http.StatusNotFound) | ||
144 | return | ||
145 | } | ||
146 | |||
147 | w.Header().Set("Content-Type", "text/plain") | ||
148 | w.Write(robotstxtdata) | ||
149 | } | ||
150 | } | ||
151 | |||
129 | // ListenAndServe creates a listening HTTP server bound to | 152 | // ListenAndServe creates a listening HTTP server bound to |
130 | // the interface specified by bind and sets up a Gopher to HTTP | 153 | // the interface specified by bind and sets up a Gopher to HTTP |
131 | // proxy proxying requests as requested and by default will proxy | 154 | // proxy proxying requests as requested and by default will proxy |
132 | // to a Gopher server address specified by uri if no server is | 155 | // to a Gopher server address specified by uri if no server is |
133 | // specified by the request. | 156 | // specified by the request. The robotsfile argument is the path to |
134 | func ListenAndServe(bind, uri string) error { | 157 | // a robots.txt file that is parsed for testing user agents and is |
135 | var tpl *template.Template | 158 | // served as-is at /robots.txt. |
159 | func ListenAndServe(bind, robotsfile, uri string) error { | ||
160 | var ( | ||
161 | tpl *template.Template | ||
162 | robotsdata *robotstxt.RobotsData | ||
163 | ) | ||
164 | |||
165 | robotstxtdata, err := ioutil.ReadFile(robotsfile) | ||
166 | if err != nil { | ||
167 | log.Printf("error reading robots.txt: %s", err) | ||
168 | robotstxtdata = nil | ||
169 | } else { | ||
170 | robotsdata, err = robotstxt.FromBytes(robotstxtdata) | ||
171 | if err != nil { | ||
172 | log.Printf("error reading robots.txt: %s", err) | ||
173 | robotstxtdata = nil | ||
174 | } | ||
175 | } | ||
136 | 176 | ||
137 | tpldata, err := ioutil.ReadFile(".template") | 177 | tpldata, err := ioutil.ReadFile(".template") |
138 | if err == nil { | 178 | if err == nil { |
@@ -144,6 +184,8 @@ func ListenAndServe(bind, uri string) error { | |||
144 | log.Fatal(err) | 184 | log.Fatal(err) |
145 | } | 185 | } |
146 | 186 | ||
147 | http.HandleFunc("/", Handler(tpl, uri)) | 187 | http.HandleFunc("/", GopherHandler(tpl, robotsdata, uri)) |
188 | http.HandleFunc("/robots.txt", RobotsTxtHandler(robotstxtdata)) | ||
189 | |||
148 | return http.ListenAndServe(bind, nil) | 190 | return http.ListenAndServe(bind, nil) |
149 | } | 191 | } |
diff --git a/robots.txt b/robots.txt new file mode 100644 index 0000000..1f53798 --- /dev/null +++ b/robots.txt | |||
@@ -0,0 +1,2 @@ | |||
1 | User-agent: * | ||
2 | Disallow: / | ||