Sourcegraph: A large-scale code search engine in Go
Google I/O 2014
25 June 2014
Video at www.youtube.com/watch?v=-DpKaoPz8l8
Blog post at sourcegraph.com/blog/google-io-2014-building-sourcegraph-a-large-scale-code-search-engine-in-go
2
Free and (mostly) open source: sourcegraph.com
3
Sourcegraph has been written in Go since day 1.
go/doc and go/types.
Sourcegraph was inspired by Go. What if every language had a godoc that:
We define our handlers with an error return:
func serveXYZ(w http.ResponseWriter, r *http.Request) error { ... }
Plus a simple wrapper function to make them http.Handler, check auth, etc.
Uses global vars instead of per-request context for DB, config, etc.
Routes with gorilla/mux.
Renders HTML:
func executeTemplate(req *http.Request, resp http.ResponseWriter, tmplName string,
status int, header http.Header, tmplData interface{}) error { ... }
Writes JSON:
func writeJSON(w http.ResponseWriter, v interface{}) error { ... }
The API client and data store should both implement the same interfaces:
type RepositoriesService interface { Get(name string) (*Repo, error) List() ([]*Repo, error) Search(opt *SearchOptions) ([]*Repo, error) // ... }
type repoStore struct{ db *db } func (s *repoStore) Get(name string) (*Repo, error) { var repo *Repo return repo, s.db.Select(&repo, "SELECT * FROM repo WHERE name=$1", name) }
type repoClient struct{ baseURL string } func (s *repoClient) Get(name string) (*Repo, error) { resp, err := http.Get(fmt.Sprintf("%s/api/repos/%s", s.baseURL, name)) if err != nil { return nil, err } defer resp.Body.Close() var repo Repo return &repo, json.NewDecoder(resp.Body).Decode(&repo) }
Frontend handler:
package main
import (
	"encoding/json"
	"fmt"
	"html"
	"log"
	"net/http"
	"path/filepath"

	"github.com/google/go-querystring/query"
	"github.com/sqs/mux"
	"github.com/sqs/schema"
)
// START SVC OMIT
// RepositoriesService is the set of repository operations that both the data
// store (repoStore) and the HTTP API client (repoClient) implement, so the
// API server and the frontend can be written against the same interface.
type RepositoriesService interface {
// Get returns the repository with the given name.
Get(name string) (*Repo, error)
// List returns repositories.
List() ([]*Repo, error)
// Search returns repositories for the given search options.
Search(opt *SearchOptions) ([]*Repo, error)
// ...
}
// END SVC OMIT
// Repo describes a repository. It is the value exchanged as JSON between the
// API handlers and repoClient.
type Repo struct {
// Name is the repository's name.
Name string
// CloneURL is the URL of the repository (e.g. a git:// URL).
CloneURL string
}
// START SEARCH OPTIONS OMIT
// options for method: Search(opt *SearchOptions) ([]*Repo, error) // HL
type SearchOptions struct {
// Both fields travel as URL query parameters: repoClient.Search encodes
// this struct with go-querystring and the HTTP handlers decode it again
// with the schema decoder.
// NOTE(review): presumably Owner/Language filter the results; the stub
// repoStore.Search in this file only logs them — confirm intended meaning.
Owner string
Language string
}
// END SEARCH OPTIONS OMIT
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// START API GET OMIT
var repoDataStore RepositoriesService = &repoStore{}
// handleRepoGet serves the repository named by the "Name" route variable as
// JSON, looking it up in repoDataStore.
//
// If the lookup or the JSON encoding fails, the error is logged and the client
// receives 500 Internal Server Error instead of a 200 with a misleading body.
func handleRepoGet(w http.ResponseWriter, r *http.Request) {
	name := mux.Vars(r)["Name"]
	repo, err := repoDataStore.Get(name)
	if err != nil {
		log.Printf("handleRepoGet: getting repo %q: %v", name, err)
		http.Error(w, http.StatusText(http.StatusInternalServerError), http.StatusInternalServerError)
		return
	}
	b, err := json.Marshal(repo)
	if err != nil {
		log.Printf("handleRepoGet: encoding repo %q: %v", name, err)
		http.Error(w, http.StatusText(http.StatusInternalServerError), http.StatusInternalServerError)
		return
	}
	w.Header().Set("Content-Type", "application/json")
	if _, err := w.Write(b); err != nil {
		// Headers are already sent; all we can do is record the failure.
		log.Printf("handleRepoGet: writing response: %v", err)
	}
}
// END API GET OMIT
// START API LIST OMIT
// handleRepoList serves the list returned by repoDataStore.List as JSON.
//
// If listing or JSON encoding fails, the error is logged and the client
// receives 500 Internal Server Error instead of a 200 with a misleading body.
func handleRepoList(w http.ResponseWriter, r *http.Request) {
	repos, err := repoDataStore.List()
	if err != nil {
		log.Printf("handleRepoList: listing repos: %v", err)
		http.Error(w, http.StatusText(http.StatusInternalServerError), http.StatusInternalServerError)
		return
	}
	b, err := json.Marshal(repos)
	if err != nil {
		log.Printf("handleRepoList: encoding repos: %v", err)
		http.Error(w, http.StatusText(http.StatusInternalServerError), http.StatusInternalServerError)
		return
	}
	w.Header().Set("Content-Type", "application/json")
	if _, err := w.Write(b); err != nil {
		// Headers are already sent; all we can do is record the failure.
		log.Printf("handleRepoList: writing response: %v", err)
	}
}
// END API GET OMIT
// START API SEARCH OMIT
var d = schema.NewDecoder()
// handleRepoSearch decodes SearchOptions from the request's query string,
// runs the search against repoDataStore and serves the result as JSON.
//
// A query-string decoding error is logged but does not abort the request: the
// search runs with whatever options were decoded, as it did before.
// Search and encoding failures are logged and answered with 500.
func handleRepoSearch(w http.ResponseWriter, r *http.Request) {
	var opt SearchOptions
	if err := d.Decode(&opt, r.URL.Query()); err != nil { // decode querystring with github.com/gorilla/schema // HL
		// NOTE(review): the decoder may reject e.g. unknown keys — confirm
		// against the schema package before turning this into a 400.
		log.Printf("handleRepoSearch: decoding search options: %v", err)
	}
	// ...
	// END API SEARCH OMIT
	repos, err := repoDataStore.Search(&opt)
	if err != nil {
		log.Printf("handleRepoSearch: searching repos: %v", err)
		http.Error(w, http.StatusText(http.StatusInternalServerError), http.StatusInternalServerError)
		return
	}
	b, err := json.Marshal(repos)
	if err != nil {
		log.Printf("handleRepoSearch: encoding repos: %v", err)
		http.Error(w, http.StatusText(http.StatusInternalServerError), http.StatusInternalServerError)
		return
	}
	w.Header().Set("Content-Type", "application/json")
	if _, err := w.Write(b); err != nil {
		// Headers are already sent; all we can do is record the failure.
		log.Printf("handleRepoSearch: writing response: %v", err)
	}
}
// START API ROUTER OMIT
// Names of the API routes. NewAPIRouter assigns them to the route
// definitions; the server looks routes up by these names to mount handlers,
// and repoClient looks them up to generate request URLs.
const (
RepoGetRoute = "repo"
RepoListRoute = "repo.list"
RepoSearchRoute = "repo.search" // OMIT
)
// NewAPIRouter returns a router on which the API routes are defined and named
// but have no handlers attached. The server's init mounts handlers on the
// named routes; repoClient uses a router built the same way to generate URLs.
func NewAPIRouter() *mux.Router {
m := mux.NewRouter()
// define the routes // HL
// NOTE(review): the search route is registered ahead of the {Name:.*} // OMIT
// wildcard, presumably so "/api/repos/search" is not captured as a repo // OMIT
// name — confirm that mux matches routes in registration order. // OMIT
m.Path("/api/repos/search").Name(RepoSearchRoute) // OMIT
m.Path("/api/repos/{Name:.*}").Name(RepoGetRoute)
m.Path("/api/repos").Name(RepoListRoute)
return m
}
// init builds the API router, attaches a handler to each named route and
// registers the router for the "/api/" prefix on the default ServeMux.
func init() {
	router := NewAPIRouter()

	// mount handlers // HL
	mounts := []struct {
		route   string
		handler http.HandlerFunc
	}{
		{RepoGetRoute, handleRepoGet},
		{RepoListRoute, handleRepoList},
		{RepoSearchRoute, handleRepoSearch},
	}
	for _, mt := range mounts {
		router.Get(mt.route).HandlerFunc(mt.handler)
	}

	http.Handle("/api/", router)
}
// END API ROUTER OMIT
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
var repoAPIClient RepositoriesService = &repoClient{"http://localhost:7777"} func handleRepoPage(w http.ResponseWriter, r *http.Request) { name := mux.Vars(r)["Name"] repo, _ := repoAPIClient.Get(name) fmt.Fprintf(w, "<h1>%s</h1><p>Clone URL: %s</p>", repo.Name, repo.CloneURL) }
// handleRepoSearchPage decodes SearchOptions from the query string, runs the
// search through repoAPIClient and renders the results as HTML.
//
// A query-string decoding error is logged and the search proceeds with
// whatever was decoded; a failed search is answered with 500. Everything
// interpolated into the page is HTML-escaped, since the options come straight
// from the request and the repository data from a remote API.
func handleRepoSearchPage(w http.ResponseWriter, r *http.Request) {
	var opt SearchOptions
	if err := d.Decode(&opt, r.URL.Query()); err != nil { // decode querystring with github.com/gorilla/schema // HL
		log.Printf("handleRepoSearchPage: decoding search options: %v", err)
	}
	repos, err := repoAPIClient.Search(&opt)
	if err != nil {
		log.Printf("handleRepoSearchPage: searching repos: %v", err)
		http.Error(w, http.StatusText(http.StatusInternalServerError), http.StatusInternalServerError)
		return
	}
	w.Header().Set("Content-Type", "text/html; charset=utf-8")
	fmt.Fprintf(w, "<h1>Search: %s</h1>", html.EscapeString(fmt.Sprintf("%+v", opt)))
	for _, repo := range repos {
		if repo == nil {
			// A JSON null element decodes to a nil *Repo; skip it.
			continue
		}
		cloneURL := html.EscapeString(repo.CloneURL)
		fmt.Fprintf(w, `<p>%s (<a href="%s">%s</a>)</p>`, html.EscapeString(repo.Name), cloneURL, cloneURL)
	}
}
// init registers the frontend pages: a router with the search page and the
// per-repository page is mounted at "/" on the default ServeMux.
func init() {
m := mux.NewRouter()
// NOTE(review): the search page is registered before the {Name:.*} wildcard,
// presumably so "/repos/search" is not treated as a repository name — confirm
// that mux matches routes in registration order.
m.Path("/repos/search").HandlerFunc(handleRepoSearchPage)
m.Path("/repos/{Name:.*}").HandlerFunc(handleRepoPage)
http.Handle("/", m)
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// START CLIENT OMIT
// repoClient implements RepositoriesService by making HTTP requests to the API
// server at baseURL (scheme and host, without a trailing path).
type repoClient struct{ baseURL string }
// Get fetches the repository called name from the API server.
//
// It returns an error if the request fails, if the server answers with
// anything other than 200 OK, or if the body is not valid JSON. On error the
// returned *Repo is nil, never a partially decoded value.
func (s *repoClient) Get(name string) (*Repo, error) {
	resp, err := http.Get(fmt.Sprintf("%s/api/repos/%s", s.baseURL, name))
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("get repo %q: unexpected HTTP status %s", name, resp.Status)
	}
	var repo Repo
	if err := json.NewDecoder(resp.Body).Decode(&repo); err != nil {
		return nil, fmt.Errorf("get repo %q: decoding response: %v", name, err)
	}
	return &repo, nil
}
// END CLIENT OMIT
// START CLIENT LIST OMIT
var apiRouter = NewAPIRouter()
// List fetches the repository list from the API server. The request URL is
// generated from the RepoListRoute definition on apiRouter rather than being
// spelled out by hand.
//
// It returns an error if the URL cannot be built, the request fails, the
// server answers with anything other than 200 OK, or the body is not valid
// JSON.
func (s *repoClient) List() ([]*Repo, error) {
	url, err := apiRouter.Get(RepoListRoute).URL() // HL
	if err != nil {
		return nil, err
	}
	resp, err := http.Get(s.baseURL + url.String())
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("list repos: unexpected HTTP status %s", resp.Status)
	}
	// Decode before building the return values: in `return repos, Decode(&repos)`
	// the Go spec leaves it unspecified whether repos is read before or after
	// the call fills it in.
	var repos []*Repo
	if err := json.NewDecoder(resp.Body).Decode(&repos); err != nil {
		return nil, err
	}
	return repos, nil
}
// END CLIENT LIST OMIT
// START CLIENT SEARCH OMIT
// Search asks the API server for repositories using opt, which is sent as the
// URL query string. It returns an error if the URL or query string cannot be
// built, the request fails, the status is not 200 OK, or the body is not
// valid JSON.
func (s *repoClient) Search(opt *SearchOptions) ([]*Repo, error) {
	url, err := apiRouter.Get(RepoSearchRoute).URL()
	if err != nil { // OMIT
		return nil, err // OMIT
	} // OMIT
	q, err := query.Values(opt) // encode querystring with github.com/google/go-querystring/query // HL
	if err != nil { // OMIT
		return nil, err // OMIT
	} // OMIT
	resp, err := http.Get(s.baseURL + url.String() + "?" + q.Encode())
	// ...
	if err != nil { // OMIT
		return nil, err // OMIT
	} // OMIT
	defer resp.Body.Close() // OMIT
	if resp.StatusCode != http.StatusOK { // OMIT
		return nil, fmt.Errorf("search repos: unexpected HTTP status %s", resp.Status) // OMIT
	} // OMIT
	// OMIT
	// Decode first, then return: reading repos in the same return // OMIT
	// statement as Decode(&repos) has unspecified evaluation order. // OMIT
	var repos []*Repo // OMIT
	if err := json.NewDecoder(resp.Body).Decode(&repos); err != nil { // OMIT
		return nil, err // OMIT
	} // OMIT
	return repos, nil // OMIT
}
// END CLIENT SEARCH OMIT
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// START STORE OMIT
type repoStore struct{ db *db }
// Get loads the repository called name from the database.
//
// The query runs before the return values are built: in the original
// `return repo, s.db.Select(&repo, ...)` the Go spec does not say whether
// repo is read before or after Select assigns it, so nil could be returned.
// On error the returned *Repo is nil.
func (s *repoStore) Get(name string) (*Repo, error) {
	var repo *Repo
	if err := s.db.Select(&repo, "SELECT * FROM repo WHERE name=$1", name); err != nil {
		return nil, err
	}
	return repo, nil
}
// END STORE OMIT
// List is a stub: it always returns a nil slice and a nil error.
func (s *repoStore) List() ([]*Repo, error) { return nil, nil }
// Search is a canned implementation: it logs the options it was given and
// always returns the same two sample repositories.
func (s *repoStore) Search(opt *SearchOptions) ([]*Repo, error) {
	log.Printf("repo search options: %+v", opt)

	samples := []*Repo{
		{Name: "myrepo", CloneURL: "git://github.com/foo/myrepo.git"},
		{Name: "mux", CloneURL: "git://github.com/gorilla/mux.git"},
	}
	return samples, nil
}
// db is a stand-in for a real database handle; it holds no state.
type db struct{}

// Select fakes a query. When v is a **Repo it fabricates a repository from the
// first query argument, which must be the repository name as a string: the
// repo's Name is the last path element of that name and its CloneURL is
// "git://" + name + ".git". For any other v it does nothing and returns nil.
//
// It returns an error, rather than panicking or inventing a repository from
// an empty name, when the name argument is missing or is not a string.
func (*db) Select(v interface{}, sql string, args ...interface{}) error {
	repo, ok := v.(**Repo)
	if !ok {
		return nil
	}
	if len(args) == 0 {
		return fmt.Errorf("select %q: missing name argument", sql)
	}
	name, ok := args[0].(string)
	if !ok {
		return fmt.Errorf("select %q: name argument is %T, want string", sql, args[0])
	}
	*repo = &Repo{Name: filepath.Base(name), CloneURL: "git://" + name + ".git"}
	return nil
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// main serves the handlers registered on the default ServeMux on port 7777
// and logs the error ListenAndServe returns when it stops.
func main() {
	const bind = ":7777"
	log.Printf("Listening on %s", bind)

	err := http.ListenAndServe(bind, nil)
	log.Println(err)
}
API handler:
var repoStore RepositoriesService = &repoStore{dbh}
func serveRepository(w http.ResponseWriter, r *http.Request) error {
rp, err := repoStore.Get(mux.Vars(r)["Repo"])
if err != nil {
return repositoryError(err)
}
writeLastModifiedHeader(rp.UpdatedAt)
return writeJSON(w, rp)
}
Separate the route definition from the mounting of handlers:
const ( RepoGetRoute = "repo" RepoListRoute = "repo.list" ) func NewAPIRouter() *mux.Router { m := mux.NewRouter() // define the routes m.Path("/api/repos/{Name:.*}").Name(RepoGetRoute) m.Path("/api/repos").Name(RepoListRoute) return m } func init() { m := NewAPIRouter() // mount handlers m.Get(RepoGetRoute).HandlerFunc(handleRepoGet) m.Get(RepoListRoute).HandlerFunc(handleRepoList) m.Get(RepoSearchRoute).HandlerFunc(handleRepoSearch) http.Handle("/api/", m) }
In your API client, generate the URL from the corresponding route definition:
var apiRouter = NewAPIRouter() func (s *repoClient) List() ([]*Repo, error) { url, _ := apiRouter.Get(RepoListRoute).URL() resp, err := http.Get(s.baseURL + url.String()) if err != nil { return nil, err } defer resp.Body.Close() var repos []*Repo return repos, json.NewDecoder(resp.Body).Decode(&repos) }
// options for method: Search(opt *SearchOptions) ([]*Repo, error) type SearchOptions struct { Owner string Language string }
package main
import (
"encoding/json"
"fmt"
"log"
"net/http"
"path/filepath"
"github.com/google/go-querystring/query"
"github.com/sqs/mux"
"github.com/sqs/schema"
)
// START SVC OMIT
type RepositoriesService interface {
Get(name string) (*Repo, error)
List() ([]*Repo, error)
Search(opt *SearchOptions) ([]*Repo, error)
// ...
}
// END SVC OMIT
type Repo struct {
Name string
CloneURL string
}
// START SEARCH OPTIONS OMIT
// options for method: Search(opt *SearchOptions) ([]*Repo, error) // HL
type SearchOptions struct {
Owner string
Language string
}
// END SEARCH OPTIONS OMIT
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// START API GET OMIT
var repoDataStore RepositoriesService = &repoStore{}
// handleRepoGet serves the repository named by the "Name" route variable as
// JSON, looking it up in repoDataStore.
//
// If the lookup or the JSON encoding fails, the error is logged and the client
// receives 500 Internal Server Error instead of a 200 with a misleading body.
func handleRepoGet(w http.ResponseWriter, r *http.Request) {
	name := mux.Vars(r)["Name"]
	repo, err := repoDataStore.Get(name)
	if err != nil {
		log.Printf("handleRepoGet: getting repo %q: %v", name, err)
		http.Error(w, http.StatusText(http.StatusInternalServerError), http.StatusInternalServerError)
		return
	}
	b, err := json.Marshal(repo)
	if err != nil {
		log.Printf("handleRepoGet: encoding repo %q: %v", name, err)
		http.Error(w, http.StatusText(http.StatusInternalServerError), http.StatusInternalServerError)
		return
	}
	w.Header().Set("Content-Type", "application/json")
	if _, err := w.Write(b); err != nil {
		// Headers are already sent; all we can do is record the failure.
		log.Printf("handleRepoGet: writing response: %v", err)
	}
}
// END API GET OMIT
// START API LIST OMIT
// handleRepoList serves the list returned by repoDataStore.List as JSON.
//
// If listing or JSON encoding fails, the error is logged and the client
// receives 500 Internal Server Error instead of a 200 with a misleading body.
func handleRepoList(w http.ResponseWriter, r *http.Request) {
	repos, err := repoDataStore.List()
	if err != nil {
		log.Printf("handleRepoList: listing repos: %v", err)
		http.Error(w, http.StatusText(http.StatusInternalServerError), http.StatusInternalServerError)
		return
	}
	b, err := json.Marshal(repos)
	if err != nil {
		log.Printf("handleRepoList: encoding repos: %v", err)
		http.Error(w, http.StatusText(http.StatusInternalServerError), http.StatusInternalServerError)
		return
	}
	w.Header().Set("Content-Type", "application/json")
	if _, err := w.Write(b); err != nil {
		// Headers are already sent; all we can do is record the failure.
		log.Printf("handleRepoList: writing response: %v", err)
	}
}
// END API GET OMIT
var d = schema.NewDecoder() func handleRepoSearch(w http.ResponseWriter, r *http.Request) { var opt SearchOptions d.Decode(&opt, r.URL.Query()) // decode querystring with github.com/gorilla/schema // ...
repos, _ := repoDataStore.Search(&opt)
b, _ := json.Marshal(repos)
w.Write(b)
}
// START API ROUTER OMIT
const (
RepoGetRoute = "repo"
RepoListRoute = "repo.list"
RepoSearchRoute = "repo.search" // OMIT
)
func NewAPIRouter() *mux.Router {
m := mux.NewRouter()
// define the routes // HL
m.Path("/api/repos/search").Name(RepoSearchRoute) // OMIT
m.Path("/api/repos/{Name:.*}").Name(RepoGetRoute)
m.Path("/api/repos").Name(RepoListRoute)
return m
}
func init() {
m := NewAPIRouter()
// mount handlers // HL
m.Get(RepoGetRoute).HandlerFunc(handleRepoGet)
m.Get(RepoListRoute).HandlerFunc(handleRepoList)
m.Get(RepoSearchRoute).HandlerFunc(handleRepoSearch)
http.Handle("/api/", m)
}
// END API ROUTER OMIT
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// START FRONTEND OMIT
var repoAPIClient RepositoriesService = &repoClient{"http://localhost:7777"}
// handleRepoPage renders an HTML page for the repository named by the "Name"
// route variable, fetching it through repoAPIClient.
//
// A failed fetch is logged and answered with 500; previously the error was
// dropped and the nil *Repo was dereferenced, panicking the handler.
func handleRepoPage(w http.ResponseWriter, r *http.Request) {
	name := mux.Vars(r)["Name"]
	repo, err := repoAPIClient.Get(name) // HL
	if err != nil {
		log.Printf("handleRepoPage: getting repo %q: %v", name, err)
		http.Error(w, http.StatusText(http.StatusInternalServerError), http.StatusInternalServerError)
		return
	}
	if repo == nil {
		http.NotFound(w, r)
		return
	}
	// NOTE(review): repo.Name and repo.CloneURL derive from the request path
	// and are written into HTML unescaped; escape them (html.EscapeString)
	// once the "html" package is imported by this file.
	fmt.Fprintf(w, "<h1>%s</h1><p>Clone URL: %s</p>", repo.Name, repo.CloneURL)
}
// END FRONTEND OMIT
func handleRepoSearchPage(w http.ResponseWriter, r *http.Request) {
var opt SearchOptions
d.Decode(&opt, r.URL.Query()) // decode querystring with github.com/gorilla/schema // HL
repos, _ := repoAPIClient.Search(&opt)
fmt.Fprintf(w, "<h1>Search: %+v</h1>", opt)
for _, repo := range repos {
fmt.Fprintf(w, `<p>%s (<a href="%s">%s</a>)</p>`, repo.Name, repo.CloneURL, repo.CloneURL)
}
}
func init() {
m := mux.NewRouter()
m.Path("/repos/search").HandlerFunc(handleRepoSearchPage)
m.Path("/repos/{Name:.*}").HandlerFunc(handleRepoPage)
http.Handle("/", m)
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// START CLIENT OMIT
type repoClient struct{ baseURL string }
// Get fetches the repository called name from the API server.
//
// It returns an error if the request fails, if the server answers with
// anything other than 200 OK, or if the body is not valid JSON. On error the
// returned *Repo is nil, never a partially decoded value.
func (s *repoClient) Get(name string) (*Repo, error) {
	resp, err := http.Get(fmt.Sprintf("%s/api/repos/%s", s.baseURL, name))
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("get repo %q: unexpected HTTP status %s", name, resp.Status)
	}
	var repo Repo
	if err := json.NewDecoder(resp.Body).Decode(&repo); err != nil {
		return nil, fmt.Errorf("get repo %q: decoding response: %v", name, err)
	}
	return &repo, nil
}
// END CLIENT OMIT
// START CLIENT LIST OMIT
var apiRouter = NewAPIRouter()
// List fetches the repository list from the API server. The request URL is
// generated from the RepoListRoute definition on apiRouter rather than being
// spelled out by hand.
//
// It returns an error if the URL cannot be built, the request fails, the
// server answers with anything other than 200 OK, or the body is not valid
// JSON.
func (s *repoClient) List() ([]*Repo, error) {
	url, err := apiRouter.Get(RepoListRoute).URL() // HL
	if err != nil {
		return nil, err
	}
	resp, err := http.Get(s.baseURL + url.String())
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("list repos: unexpected HTTP status %s", resp.Status)
	}
	// Decode before building the return values: in `return repos, Decode(&repos)`
	// the Go spec leaves it unspecified whether repos is read before or after
	// the call fills it in.
	var repos []*Repo
	if err := json.NewDecoder(resp.Body).Decode(&repos); err != nil {
		return nil, err
	}
	return repos, nil
}
// END CLIENT LIST OMIT
// START CLIENT SEARCH OMIT
// Search asks the API server for repositories using opt, which is sent as the
// URL query string. It returns an error if the URL or query string cannot be
// built, the request fails, the status is not 200 OK, or the body is not
// valid JSON.
func (s *repoClient) Search(opt *SearchOptions) ([]*Repo, error) {
	url, err := apiRouter.Get(RepoSearchRoute).URL()
	if err != nil { // OMIT
		return nil, err // OMIT
	} // OMIT
	q, err := query.Values(opt) // encode querystring with github.com/google/go-querystring/query // HL
	if err != nil { // OMIT
		return nil, err // OMIT
	} // OMIT
	resp, err := http.Get(s.baseURL + url.String() + "?" + q.Encode())
	// ...
	if err != nil { // OMIT
		return nil, err // OMIT
	} // OMIT
	defer resp.Body.Close() // OMIT
	if resp.StatusCode != http.StatusOK { // OMIT
		return nil, fmt.Errorf("search repos: unexpected HTTP status %s", resp.Status) // OMIT
	} // OMIT
	// OMIT
	// Decode first, then return: reading repos in the same return // OMIT
	// statement as Decode(&repos) has unspecified evaluation order. // OMIT
	var repos []*Repo // OMIT
	if err := json.NewDecoder(resp.Body).Decode(&repos); err != nil { // OMIT
		return nil, err // OMIT
	} // OMIT
	return repos, nil // OMIT
}
// END CLIENT SEARCH OMIT
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// START STORE OMIT
type repoStore struct{ db *db }
// Get loads the repository called name from the database.
//
// The query runs before the return values are built: in the original
// `return repo, s.db.Select(&repo, ...)` the Go spec does not say whether
// repo is read before or after Select assigns it, so nil could be returned.
// On error the returned *Repo is nil.
func (s *repoStore) Get(name string) (*Repo, error) {
	var repo *Repo
	if err := s.db.Select(&repo, "SELECT * FROM repo WHERE name=$1", name); err != nil {
		return nil, err
	}
	return repo, nil
}
// END STORE OMIT
func (s *repoStore) List() ([]*Repo, error) { return nil, nil }
func (s *repoStore) Search(opt *SearchOptions) ([]*Repo, error) {
log.Printf("repo search options: %+v", opt)
return []*Repo{{"myrepo", "git://github.com/foo/myrepo.git"}, {"mux", "git://github.com/gorilla/mux.git"}}, nil
}
// db is a stand-in for a real database handle; it holds no state.
type db struct{}

// Select fakes a query. When v is a **Repo it fabricates a repository from the
// first query argument, which must be the repository name as a string: the
// repo's Name is the last path element of that name and its CloneURL is
// "git://" + name + ".git". For any other v it does nothing and returns nil.
//
// It returns an error, rather than panicking or inventing a repository from
// an empty name, when the name argument is missing or is not a string.
func (*db) Select(v interface{}, sql string, args ...interface{}) error {
	repo, ok := v.(**Repo)
	if !ok {
		return nil
	}
	if len(args) == 0 {
		return fmt.Errorf("select %q: missing name argument", sql)
	}
	name, ok := args[0].(string)
	if !ok {
		return fmt.Errorf("select %q: name argument is %T, want string", sql, args[0])
	}
	*repo = &Repo{Name: filepath.Base(name), CloneURL: "git://" + name + ".git"}
	return nil
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
func main() {
bind := ":7777"
log.Printf("Listening on %s", bind)
log.Println(http.ListenAndServe(bind, nil))
}
func (s *repoClient) Search(opt *SearchOptions) ([]*Repo, error) { url, _ := apiRouter.Get(RepoSearchRoute).URL() q, _ := query.Values(opt) // encode querystring with github.com/google/go-querystring/query resp, err := http.Get(s.baseURL + url.String() + "?" + q.Encode()) // ... }
http.Handlers and API client http.Transport to track timing info using https://github.com/sourcegraph/appmon
Just indexing the raw text of source files isn't good enough:
Sourcegraph analyzes code:
Requires:
package mypkg import "fmt" // Greet prints a friendly greeting. func Greet(salutation string) { fmt.Println(salutation, "to", Name) } // Name of whom you want to greet. var Name = "Milton"
~/src/github.com/sqs/mypkg $ srcgraph make
{"Definitions": [
{"Name": "Greet", "Kind": "func",
"File": "myfile.go", "DefStart": 30, "DefEnd": 68,
"Exported": true, "Type": "func(salutation string)"},
...
],
"References": [
{"DefRepo": "code.google.com/p/go", "DefUnit": "fmt", "DefPath": "Println",
"File": "myfile.go", "Start": 49, "End": 56}, ...
]
}
go install github.com/sourcegraph/srcgraph
srcgraph make produces a standard Makefile describing the analysis
process.
The Makefile's recipes in turn call other srcgraph subcommands:
~/src/github.com/golang/groupcache$ srcgraph make -print-makefile
# Resolve dependencies of groupcache
.sourcegraph-data/groupcache@GoPackage_deps.json: *.go
srcgraph resolve-deps -json $^ > $@
# Dump definitions and references in groupcache
.sourcegraph-data/groupcache@GoPackage_graph.json: *.go
srcgraph graph -json $^ > $@
# Attribute each definition in groupcache to its authors
.sourcegraph-data/groupcache@GoPackage_authorship.json: .sourcegraph-data/groupcache@GoPackage_graph.json
srcgraph authorship -json $^ > $@type Analyzer interface { // Analyze a package to find all definitions (funcs, types, vars, etc.) and references Analyze(pkg string) ([]*Def, []*Ref, error) } type DependencyLister interface { // ListDependencies of pkg, which is either a file (e.g., package.json, setup.py) // or a dir (e.g., a Go package directory). ListDependencies(pkg string) ([]*Dep, error) }
We want to analyze:
var filename = "foo"; var address = "bar"; function openFile() {} function closeFile() {} function startServer() {}
First implement lang.Analyzer:
type Analyzer interface {
// Analyze a package to find all definitions (funcs, types, vars, etc.) and references
Analyze(pkg string) ([]*Def, []*Ref, error)
}type JSAnalyzer struct{} var jsdef = regexp.MustCompile(`(var|function) (\w+)`) func (_ JSAnalyzer) Analyze(file string) ([]*lang.Def, []*lang.Ref, error) { src, err := ioutil.ReadFile(file) if err != nil { log.Fatal(err) } var defs []*lang.Def for _, m := range jsdef.FindAllStringSubmatch(string(src), -1) { defs = append(defs, &lang.Def{Name: m[2], Type: m[1]}) } return defs, nil, nil }
package main
import (
"flag"
"fmt"
"log"
"github.com/sourcegraph/talks/google-io-2014/javascript"
)
// main analyzes one JavaScript file — the single command-line argument, or
// the bundled sample when exactly one argument is not given — and prints a
// table of the definitions found.
func main() {
	flag.Parse()

	file := "google-io-2014/javascript/sample.js"
	if flag.NArg() == 1 {
		file = flag.Arg(0)
	}

	log.SetFlags(0)
	log.Println("Analyzing", file, "for funcs & vars")

	analyzer := &javascript.JSAnalyzer{}
	defs, _, err := analyzer.Analyze(file)
	if err != nil {
		log.Fatal(err)
	}

	fmt.Println("NAME \tTYPE")
	fmt.Println("---- \t----")
	for i := range defs {
		fmt.Printf("%-10s\t%s\n", defs[i].Name, defs[i].Type)
	}
}
(Real analyzers perform type checking and inference and are written in their
target language.)
We've defined the analyzer, but how does the program know to call it to analyze .js files?
Ideas:
http.Handle("/foo", myHandler)
sql.Register("postgresql", pgdrv)
crypto.RegisterHash(h, f)
signal.Notify(c, syscall.SIGINT)
"Don't ad-lib; just crib from the standard lib"
33
github.com/dotcloud/docker/engine/engine.go:
type Handler func(*Job) Status
var globalHandlers = make(map[string]Handler)
func Register(name string, handler Handler) error {
_, exists := globalHandlers[name]
if exists {
return fmt.Errorf("Can't overwrite global handler for command %s", name)
}
globalHandlers[name] = handler
return nil
}
github.com/benmanns/goworker/workers.go
github.com/russross/meddler/meddler.go
github.com/rcrowley/go-metrics/registry.go
List of Go funcs named Register
A global variable + a Register function.
package lang var analyzers = make(map[string]Analyzer) func Register(language string, a Analyzer) { // maybe check for dupes or non-nil analyzers[language] = a }
In the language-specific packages, register the handler:
package javascript func init() { lang.Register("js", &JSAnalyzer{}) }
Then import language-specific packages for side effects in your program (to call init).
package main import _ "github.com/sourcegraph/talks/google-io-2014/javascript"
import (
"github.com/sourcegraph/talks/google-io-2014/lang"
)
func main() {
lang.PrintHandlers()
}
Remember that Register function that used a global var?
In other languages, that simple solution would be frowned upon.
In Go:
main().
We upload the analysis output (language-independent JSON representing code's definitions) to our API.
And the web app will display it.
38
Feedback and follow-ups?
Interested in using Sourcegraph to host docs & examples for your project?
Or joining us to build Sourcegraph?
Or t-shirts?