Sourcegraph: A large-scale code search engine in Go

Google I/O 2014

25 June 2014

Quinn Slack

Sourcegraph

sourcegraph.com

Overview

Video at www.youtube.com/watch?v=-DpKaoPz8l8

Blog post at sourcegraph.com/blog/google-io-2014-building-sourcegraph-a-large-scale-code-search-engine-in-go

2

Sourcegraph: a code search engine written in Go

Free and (mostly) open source: sourcegraph.com

3

Demo

4

Why Go?

Sourcegraph has been written in Go since day 1.

Sourcegraph was inspired by Go. What if every language had a godoc that:

5

Building Sourcegraph with Go

6

Parts

7

Part 1: structure of a large Go web app

8

Implementation: avoiding complexity and repetition

9

No framework

We define our handlers with an error return:

func serveXYZ(w http.ResponseWriter, r *http.Request) error { ... }

Plus a simple wrapper function to make them http.Handler, check auth, etc.

Uses global vars instead of per-request context for DB, config, etc.

Routes with gorilla/mux.

Renders HTML:

func executeTemplate(req *http.Request, resp http.ResponseWriter, tmplName string,
                     status int, header http.Header, tmplData interface{}) error { ... }

Writes JSON:

func writeJSON(w http.ResponseWriter, v interface{}) error { ... }
10

Unify API client and data store interfaces

The API client and data store should both implement the same interfaces:

// RepositoriesService is the one interface that both the API client and the
// data store implement.
type RepositoriesService interface {
    // Get returns the repository with the given name.
    Get(name string) (*Repo, error)
    // List returns a list of repositories.
    List() ([]*Repo, error)
    // Search returns the repositories selected by opt.
    Search(opt *SearchOptions) ([]*Repo, error)
    // ...
}
11

Data store methods

// repoStore is the data store; it wraps a database handle.
type repoStore struct{ db *db }

// Get selects the repository called name from the repo table.
// It returns a nil *Repo whenever the query reports an error.
func (s *repoStore) Get(name string) (*Repo, error) {
    var repo *Repo
    // Run the query in its own statement: inside a single return statement the
    // language does not specify whether repo is read before or after Select
    // has filled it in.
    if err := s.db.Select(&repo, "SELECT * FROM repo WHERE name=$1", name); err != nil {
        return nil, err
    }
    return repo, nil
}
12

API client methods

// repoClient talks to the HTTP API rooted at baseURL.
type repoClient struct{ baseURL string }

// Get fetches the repository called name from the API and decodes the JSON
// response. It returns a nil *Repo together with an error if the request
// fails, the server answers with a non-200 status, or the body does not decode.
func (s *repoClient) Get(name string) (*Repo, error) {
    resp, err := http.Get(fmt.Sprintf("%s/api/repos/%s", s.baseURL, name))
    if err != nil {
        return nil, err
    }
    defer resp.Body.Close()

    // An error page must not be decoded as if it were a repository.
    if resp.StatusCode != http.StatusOK {
        return nil, fmt.Errorf("get repo %q: unexpected HTTP status %s", name, resp.Status)
    }

    var repo Repo
    if err := json.NewDecoder(resp.Body).Decode(&repo); err != nil {
        return nil, err
    }
    return &repo, nil
}
13

Simpler frontend and API http.Handlers

Frontend handler:

package main

import (
	"encoding/json"
	"fmt"
	"log"
	"net/http"
	"path/filepath"

	"github.com/google/go-querystring/query"
	"github.com/sqs/mux"
	"github.com/sqs/schema"
)

// START SVC OMIT
// RepositoriesService is the one interface that both the API client
// (repoClient) and the data store (repoStore) implement.
type RepositoriesService interface {
	// Get returns the repository with the given name.
	Get(name string) (*Repo, error)
	// List returns a list of repositories.
	List() ([]*Repo, error)
	// Search returns the repositories selected by opt.
	Search(opt *SearchOptions) ([]*Repo, error)
	// ...
}

// END SVC OMIT

// Repo is the repository record exchanged between the data store, the JSON
// API and the frontend pages.
type Repo struct {
	// Name is shown as the page heading by the frontend.
	Name     string
	// CloneURL is shown (and linked) by the frontend pages.
	CloneURL string
}

// START SEARCH OPTIONS OMIT
// options for method: Search(opt *SearchOptions) ([]*Repo, error) // HL
// SearchOptions travels as a URL query string: the API client encodes it and
// the API handler decodes it back into this struct.
type SearchOptions struct {
	// NOTE(review): presumably a repository-owner filter; the stub store only logs it.
	Owner    string
	// NOTE(review): presumably a language filter; the stub store only logs it.
	Language string
}

// END SEARCH OPTIONS OMIT

////////////////////////////////////////////////////////////////////////////////////////////////////////////////

// START API GET OMIT
// repoDataStore is the data store the API handlers read from.
var repoDataStore RepositoriesService = &repoStore{}

// handleRepoGet writes the repository named by the "Name" route variable as
// JSON. A data store or encoding failure is answered with 500 instead of
// being dropped.
func handleRepoGet(w http.ResponseWriter, r *http.Request) {
	name := mux.Vars(r)["Name"]
	repo, err := repoDataStore.Get(name)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	b, err := json.Marshal(repo)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	w.Header().Set("Content-Type", "application/json")
	w.Write(b)
}

// END API GET OMIT

// START API LIST OMIT

// handleRepoList writes the data store's repository list as JSON. A data
// store or encoding failure is answered with 500 instead of being dropped.
func handleRepoList(w http.ResponseWriter, r *http.Request) {
	repos, err := repoDataStore.List()
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	b, err := json.Marshal(repos)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	w.Header().Set("Content-Type", "application/json")
	w.Write(b)
}

// END API LIST OMIT

// START API SEARCH OMIT
var d = schema.NewDecoder()

// handleRepoSearch decodes the query string into SearchOptions, runs the
// search against the data store and writes the matches as JSON. A search or
// encoding failure is answered with 500.
// NOTE(review): the error from Decode is still dropped, so a query string that
// fails to decode is not reported to the caller.
func handleRepoSearch(w http.ResponseWriter, r *http.Request) {
	var opt SearchOptions
	d.Decode(&opt, r.URL.Query()) // decode querystring with github.com/gorilla/schema // HL
	// ...
	// END API SEARCH OMIT
	repos, err := repoDataStore.Search(&opt)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	b, err := json.Marshal(repos)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	w.Header().Set("Content-Type", "application/json")
	w.Write(b)
}

// START API ROUTER OMIT
// Route names. NewAPIRouter attaches them to URL patterns; the handlers are
// mounted, and the client's URLs generated, by looking routes up under
// these names.
const (
	RepoGetRoute    = "repo"
	RepoListRoute   = "repo.list"
	RepoSearchRoute = "repo.search" // OMIT
)

// NewAPIRouter returns a router on which every API route is defined and named
// but has no handler attached, so the same definitions can serve both request
// dispatch and URL generation.
func NewAPIRouter() *mux.Router {
	router := mux.NewRouter()
	// define the routes // HL
	routes := []struct{ pattern, name string }{
		{"/api/repos/search", RepoSearchRoute}, // OMIT
		{"/api/repos/{Name:.*}", RepoGetRoute},
		{"/api/repos", RepoListRoute},
	}
	for _, rt := range routes {
		router.Path(rt.pattern).Name(rt.name)
	}
	return router
}

// init attaches a handler to each named API route and serves the resulting
// router under "/api/" on the default mux.
func init() {
	router := NewAPIRouter()
	// mount handlers // HL
	mounts := []struct {
		route   string
		handler func(http.ResponseWriter, *http.Request)
	}{
		{RepoGetRoute, handleRepoGet},
		{RepoListRoute, handleRepoList},
		{RepoSearchRoute, handleRepoSearch},
	}
	for _, mt := range mounts {
		router.Get(mt.route).HandlerFunc(mt.handler)
	}
	http.Handle("/api/", router)
}

// END API ROUTER OMIT

////////////////////////////////////////////////////////////////////////////////////////////////////////////////

// repoAPIClient is the API client the frontend pages read from.
var repoAPIClient RepositoriesService = &repoClient{"http://localhost:7777"}

// handleRepoPage renders the HTML page for the repository named by the "Name"
// route variable. A failed API call is answered with 500 and a missing
// repository with 404, rather than dereferencing a nil *Repo.
func handleRepoPage(w http.ResponseWriter, r *http.Request) {
	name := mux.Vars(r)["Name"]
	repo, err := repoAPIClient.Get(name)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	if repo == nil {
		http.NotFound(w, r)
		return
	}
	// NOTE(review): repo.Name and repo.CloneURL are written into the HTML
	// unescaped; escape them (e.g. html.EscapeString) if repository data can
	// be untrusted.
	fmt.Fprintf(w, "<h1>%s</h1><p>Clone URL: %s</p>", repo.Name, repo.CloneURL)
}

// handleRepoSearchPage decodes the query string into SearchOptions, runs the
// search through the API client and renders the matches as HTML. A failed
// search is answered with 500 instead of rendering an empty result list.
// NOTE(review): the error from Decode is still dropped, and the repository
// fields are written into the HTML unescaped.
func handleRepoSearchPage(w http.ResponseWriter, r *http.Request) {
	var opt SearchOptions
	d.Decode(&opt, r.URL.Query()) // decode querystring with github.com/gorilla/schema // HL
	repos, err := repoAPIClient.Search(&opt)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	fmt.Fprintf(w, "<h1>Search: %+v</h1>", opt)
	for _, repo := range repos {
		fmt.Fprintf(w, `<p>%s (<a href="%s">%s</a>)</p>`, repo.Name, repo.CloneURL, repo.CloneURL)
	}
}

// init builds the frontend router and serves it under "/" on the default mux.
// The search page is listed ahead of the catch-all repository page, as in the
// original registration order.
func init() {
	router := mux.NewRouter()
	pages := []struct {
		pattern string
		handler func(http.ResponseWriter, *http.Request)
	}{
		{"/repos/search", handleRepoSearchPage},
		{"/repos/{Name:.*}", handleRepoPage},
	}
	for _, pg := range pages {
		router.Path(pg.pattern).HandlerFunc(pg.handler)
	}
	http.Handle("/", router)
}

////////////////////////////////////////////////////////////////////////////////////////////////////////////////

// START CLIENT OMIT
// repoClient talks to the HTTP API rooted at baseURL.
type repoClient struct{ baseURL string }

// Get fetches the repository called name from the API and decodes the JSON
// response. It returns a nil *Repo together with an error if the request
// fails, the server answers with a non-200 status, or the body does not decode.
func (s *repoClient) Get(name string) (*Repo, error) {
	resp, err := http.Get(fmt.Sprintf("%s/api/repos/%s", s.baseURL, name))
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	// An error page must not be decoded as if it were a repository.
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("get repo %q: unexpected HTTP status %s", name, resp.Status)
	}

	var repo Repo
	if err := json.NewDecoder(resp.Body).Decode(&repo); err != nil {
		return nil, err
	}
	return &repo, nil
}

// END CLIENT OMIT

// START CLIENT LIST OMIT

// apiRouter is used by the client only to generate URLs from route names.
var apiRouter = NewAPIRouter()

// List fetches the repository list from the API, building the request URL
// from the RepoListRoute definition. It returns a nil slice together with an
// error if URL generation or the request fails, the server answers with a
// non-200 status, or the body does not decode.
func (s *repoClient) List() ([]*Repo, error) {
	url, err := apiRouter.Get(RepoListRoute).URL() // HL
	if err != nil {
		return nil, err
	}
	resp, err := http.Get(s.baseURL + url.String())
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("list repos: unexpected HTTP status %s", resp.Status)
	}

	// Decode in its own statement: inside a single return statement the
	// language does not specify whether repos is read before or after Decode
	// has filled it in.
	var repos []*Repo
	if err := json.NewDecoder(resp.Body).Decode(&repos); err != nil {
		return nil, err
	}
	return repos, nil
}

// END CLIENT LIST OMIT

// START CLIENT SEARCH OMIT

// Search encodes opt as a query string, requests the RepoSearchRoute URL and
// decodes the JSON list of matches. Every failure (URL generation, encoding,
// transport, non-200 status, decoding) is returned with a nil slice; the
// result is decoded before the return statement so it never depends on
// operand evaluation order.
func (s *repoClient) Search(opt *SearchOptions) ([]*Repo, error) {
	url, err := apiRouter.Get(RepoSearchRoute).URL()
	if err != nil { // OMIT
		return nil, err // OMIT
	} // OMIT
	q, err := query.Values(opt) // encode querystring with github.com/google/go-querystring/query // HL
	if err != nil { // OMIT
		return nil, err // OMIT
	} // OMIT
	resp, err := http.Get(s.baseURL + url.String() + "?" + q.Encode())
	// ...
	if err != nil { // OMIT
		return nil, err // OMIT
	} // OMIT
	defer resp.Body.Close() // OMIT
	// OMIT
	if resp.StatusCode != http.StatusOK { // OMIT
		return nil, fmt.Errorf("search repos: unexpected HTTP status %s", resp.Status) // OMIT
	} // OMIT
	// OMIT
	var repos []*Repo // OMIT
	if err := json.NewDecoder(resp.Body).Decode(&repos); err != nil { // OMIT
		return nil, err // OMIT
	} // OMIT
	return repos, nil // OMIT
}

// END CLIENT SEARCH OMIT

////////////////////////////////////////////////////////////////////////////////////////////////////////////////

// START STORE OMIT
// repoStore is the data store; it wraps a database handle.
type repoStore struct{ db *db }

// Get selects the repository called name from the repo table.
// It returns a nil *Repo whenever the query reports an error.
func (s *repoStore) Get(name string) (*Repo, error) {
	var repo *Repo
	// Run the query in its own statement: inside a single return statement the
	// language does not specify whether repo is read before or after Select
	// has filled it in.
	if err := s.db.Select(&repo, "SELECT * FROM repo WHERE name=$1", name); err != nil {
		return nil, err
	}
	return repo, nil
}

// END STORE OMIT

// List is a stub: it reports no repositories and no error.
func (s *repoStore) List() ([]*Repo, error) {
	return nil, nil
}

// Search is a stub: it logs the options it was given and always answers with
// the same two hard-coded repositories.
func (s *repoStore) Search(opt *SearchOptions) ([]*Repo, error) {
	log.Printf("repo search options: %+v", opt)
	results := []*Repo{
		{Name: "myrepo", CloneURL: "git://github.com/foo/myrepo.git"},
		{Name: "mux", CloneURL: "git://github.com/gorilla/mux.git"},
	}
	return results, nil
}

// db is a stand-in for a real database handle.
type db struct{}

// Select fakes a query. When v is a **Repo it stores a Repo fabricated from
// the first query argument (taken as the repository name); for any other v it
// does nothing. The sql text is only used in the error message. It returns an
// error, instead of panicking on args[0], when a **Repo is requested without
// any argument.
func (*db) Select(v interface{}, sql string, args ...interface{}) error {
	repo, ok := v.(**Repo)
	if !ok {
		return nil
	}
	if len(args) == 0 {
		return fmt.Errorf("select %q: missing name argument", sql)
	}
	// A non-string argument yields the empty name, as before.
	name, _ := args[0].(string)
	*repo = &Repo{filepath.Base(name), "git://" + name + ".git"}
	return nil
}

////////////////////////////////////////////////////////////////////////////////////////////////////////////////

// main serves the handlers registered on the default mux on port 7777.
// ListenAndServe only returns on failure, so its error is fatal: the process
// exits with a non-zero status instead of logging and exiting 0.
func main() {
	bind := ":7777"
	log.Printf("Listening on %s", bind)
	log.Fatal(http.ListenAndServe(bind, nil))
}

API handler:

// repoDataStore is the data store the API handlers read from. It carries a
// name of its own because a package-level variable called repoStore would
// collide with the repoStore type.
var repoDataStore RepositoriesService = &repoStore{dbh}

// serveRepository looks up the repository named by the "Repo" route variable
// and writes it as JSON. Errors are returned to the handler wrapper rather
// than written to w here.
func serveRepository(w http.ResponseWriter, r *http.Request) error {
    rp, err := repoDataStore.Get(mux.Vars(r)["Repo"])
    if err != nil {
        return repositoryError(err)
    }
    writeLastModifiedHeader(rp.UpdatedAt)
    return writeJSON(w, rp)
}
14

Other API benefits

15

Unifying URL routing and generation

Separate the route definition from the mounting of handlers:

// Route names shared by the router definition, the handler mounting and URL
// generation.
const (
    RepoGetRoute    = "repo"
    RepoListRoute   = "repo.list"
    RepoSearchRoute = "repo.search"
)

// NewAPIRouter returns a router on which every API route is defined and named
// but has no handler attached.
func NewAPIRouter() *mux.Router {
    m := mux.NewRouter()
    // define the routes
    // The search route comes first so that the catch-all {Name:.*} pattern
    // cannot claim "/api/repos/search".
    m.Path("/api/repos/search").Name(RepoSearchRoute)
    m.Path("/api/repos/{Name:.*}").Name(RepoGetRoute)
    m.Path("/api/repos").Name(RepoListRoute)
    return m
}

// init mounts a handler on each named route. Every name used here must be
// defined in NewAPIRouter: m.Get returns nil for an unknown name.
func init() {
    m := NewAPIRouter()
    // mount handlers
    m.Get(RepoGetRoute).HandlerFunc(handleRepoGet)
    m.Get(RepoListRoute).HandlerFunc(handleRepoList)
    m.Get(RepoSearchRoute).HandlerFunc(handleRepoSearch)
    http.Handle("/api/", m)
}
16

Generating URLs using the router

In your API client, generate the URL from the corresponding route definition:

// apiRouter is used by the client only to generate URLs from route names.
var apiRouter = NewAPIRouter()

// List fetches the repository list from the API, building the request URL
// from the RepoListRoute definition. It returns a nil slice together with an
// error if URL generation or the request fails, the server answers with a
// non-200 status, or the body does not decode.
func (s *repoClient) List() ([]*Repo, error) {
    url, err := apiRouter.Get(RepoListRoute).URL()
    if err != nil {
        return nil, err
    }
    resp, err := http.Get(s.baseURL + url.String())
    if err != nil {
        return nil, err
    }
    defer resp.Body.Close()

    if resp.StatusCode != http.StatusOK {
        return nil, fmt.Errorf("list repos: unexpected HTTP status %s", resp.Status)
    }

    // Decode in its own statement: inside a single return statement the
    // language does not specify whether repos is read before or after Decode
    // has filled it in.
    var repos []*Repo
    if err := json.NewDecoder(resp.Body).Decode(&repos); err != nil {
        return nil, err
    }
    return repos, nil
}
17

Sharing parameter structs

// options for method: Search(opt *SearchOptions) ([]*Repo, error)
// SearchOptions travels as a URL query string: the API client encodes it and
// the API handler decodes it back into this struct.
type SearchOptions struct {
    // NOTE(review): presumably a repository-owner filter — confirm against the store.
    Owner    string
    // NOTE(review): presumably a language filter — confirm against the store.
    Language string
}
package main

import (
	"encoding/json"
	"fmt"
	"log"
	"net/http"
	"path/filepath"

	"github.com/google/go-querystring/query"
	"github.com/sqs/mux"
	"github.com/sqs/schema"
)

// START SVC OMIT
// RepositoriesService is the one interface that both the API client
// (repoClient) and the data store (repoStore) implement.
type RepositoriesService interface {
	// Get returns the repository with the given name.
	Get(name string) (*Repo, error)
	// List returns a list of repositories.
	List() ([]*Repo, error)
	// Search returns the repositories selected by opt.
	Search(opt *SearchOptions) ([]*Repo, error)
	// ...
}

// END SVC OMIT

// Repo is the repository record exchanged between the data store, the JSON
// API and the frontend pages.
type Repo struct {
	// Name is shown as the page heading by the frontend.
	Name     string
	// CloneURL is shown (and linked) by the frontend pages.
	CloneURL string
}

// START SEARCH OPTIONS OMIT
// options for method: Search(opt *SearchOptions) ([]*Repo, error) // HL
// SearchOptions travels as a URL query string: the API client encodes it and
// the API handler decodes it back into this struct.
type SearchOptions struct {
	// NOTE(review): presumably a repository-owner filter; the stub store only logs it.
	Owner    string
	// NOTE(review): presumably a language filter; the stub store only logs it.
	Language string
}

// END SEARCH OPTIONS OMIT

////////////////////////////////////////////////////////////////////////////////////////////////////////////////

// START API GET OMIT
// repoDataStore is the data store the API handlers read from.
var repoDataStore RepositoriesService = &repoStore{}

// handleRepoGet writes the repository named by the "Name" route variable as
// JSON. A data store or encoding failure is answered with 500 instead of
// being dropped.
func handleRepoGet(w http.ResponseWriter, r *http.Request) {
	name := mux.Vars(r)["Name"]
	repo, err := repoDataStore.Get(name)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	b, err := json.Marshal(repo)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	w.Header().Set("Content-Type", "application/json")
	w.Write(b)
}

// END API GET OMIT

// START API LIST OMIT

// handleRepoList writes the data store's repository list as JSON. A data
// store or encoding failure is answered with 500 instead of being dropped.
func handleRepoList(w http.ResponseWriter, r *http.Request) {
	repos, err := repoDataStore.List()
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	b, err := json.Marshal(repos)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	w.Header().Set("Content-Type", "application/json")
	w.Write(b)
}

// END API LIST OMIT

var d = schema.NewDecoder()

// handleRepoSearch decodes the query string into SearchOptions, runs the
// search against the data store and writes the matches as JSON. A search or
// encoding failure is answered with 500.
// NOTE(review): the error from Decode is still dropped, so a query string that
// fails to decode is not reported to the caller.
func handleRepoSearch(w http.ResponseWriter, r *http.Request) {
	var opt SearchOptions
	d.Decode(&opt, r.URL.Query()) // decode querystring with github.com/gorilla/schema
	// ...
	repos, err := repoDataStore.Search(&opt)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	b, err := json.Marshal(repos)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	w.Header().Set("Content-Type", "application/json")
	w.Write(b)
}

// START API ROUTER OMIT
// Route names. NewAPIRouter attaches them to URL patterns; the handlers are
// mounted, and the client's URLs generated, by looking routes up under
// these names.
const (
	RepoGetRoute    = "repo"
	RepoListRoute   = "repo.list"
	RepoSearchRoute = "repo.search" // OMIT
)

// NewAPIRouter returns a router on which every API route is defined and named
// but has no handler attached, so the same definitions can serve both request
// dispatch and URL generation.
func NewAPIRouter() *mux.Router {
	router := mux.NewRouter()
	// define the routes // HL
	routes := []struct{ pattern, name string }{
		{"/api/repos/search", RepoSearchRoute}, // OMIT
		{"/api/repos/{Name:.*}", RepoGetRoute},
		{"/api/repos", RepoListRoute},
	}
	for _, rt := range routes {
		router.Path(rt.pattern).Name(rt.name)
	}
	return router
}

// init attaches a handler to each named API route and serves the resulting
// router under "/api/" on the default mux.
func init() {
	router := NewAPIRouter()
	// mount handlers // HL
	mounts := []struct {
		route   string
		handler func(http.ResponseWriter, *http.Request)
	}{
		{RepoGetRoute, handleRepoGet},
		{RepoListRoute, handleRepoList},
		{RepoSearchRoute, handleRepoSearch},
	}
	for _, mt := range mounts {
		router.Get(mt.route).HandlerFunc(mt.handler)
	}
	http.Handle("/api/", router)
}

// END API ROUTER OMIT

////////////////////////////////////////////////////////////////////////////////////////////////////////////////

// START FRONTEND OMIT
// repoAPIClient is the API client the frontend pages read from.
var repoAPIClient RepositoriesService = &repoClient{"http://localhost:7777"}

// handleRepoPage renders the HTML page for the repository named by the "Name"
// route variable. A failed API call is answered with 500 and a missing
// repository with 404, rather than dereferencing a nil *Repo.
func handleRepoPage(w http.ResponseWriter, r *http.Request) {
	name := mux.Vars(r)["Name"]
	repo, err := repoAPIClient.Get(name) // HL
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	if repo == nil {
		http.NotFound(w, r)
		return
	}
	// NOTE(review): repo.Name and repo.CloneURL are written into the HTML
	// unescaped; escape them (e.g. html.EscapeString) if repository data can
	// be untrusted.
	fmt.Fprintf(w, "<h1>%s</h1><p>Clone URL: %s</p>", repo.Name, repo.CloneURL)
}

// END FRONTEND OMIT

// handleRepoSearchPage decodes the query string into SearchOptions, runs the
// search through the API client and renders the matches as HTML. A failed
// search is answered with 500 instead of rendering an empty result list.
// NOTE(review): the error from Decode is still dropped, and the repository
// fields are written into the HTML unescaped.
func handleRepoSearchPage(w http.ResponseWriter, r *http.Request) {
	var opt SearchOptions
	d.Decode(&opt, r.URL.Query()) // decode querystring with github.com/gorilla/schema // HL
	repos, err := repoAPIClient.Search(&opt)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	fmt.Fprintf(w, "<h1>Search: %+v</h1>", opt)
	for _, repo := range repos {
		fmt.Fprintf(w, `<p>%s (<a href="%s">%s</a>)</p>`, repo.Name, repo.CloneURL, repo.CloneURL)
	}
}

// init builds the frontend router and serves it under "/" on the default mux.
// The search page is listed ahead of the catch-all repository page, as in the
// original registration order.
func init() {
	router := mux.NewRouter()
	pages := []struct {
		pattern string
		handler func(http.ResponseWriter, *http.Request)
	}{
		{"/repos/search", handleRepoSearchPage},
		{"/repos/{Name:.*}", handleRepoPage},
	}
	for _, pg := range pages {
		router.Path(pg.pattern).HandlerFunc(pg.handler)
	}
	http.Handle("/", router)
}

////////////////////////////////////////////////////////////////////////////////////////////////////////////////

// START CLIENT OMIT
// repoClient talks to the HTTP API rooted at baseURL.
type repoClient struct{ baseURL string }

// Get fetches the repository called name from the API and decodes the JSON
// response. It returns a nil *Repo together with an error if the request
// fails, the server answers with a non-200 status, or the body does not decode.
func (s *repoClient) Get(name string) (*Repo, error) {
	resp, err := http.Get(fmt.Sprintf("%s/api/repos/%s", s.baseURL, name))
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	// An error page must not be decoded as if it were a repository.
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("get repo %q: unexpected HTTP status %s", name, resp.Status)
	}

	var repo Repo
	if err := json.NewDecoder(resp.Body).Decode(&repo); err != nil {
		return nil, err
	}
	return &repo, nil
}

// END CLIENT OMIT

// START CLIENT LIST OMIT

// apiRouter is used by the client only to generate URLs from route names.
var apiRouter = NewAPIRouter()

// List fetches the repository list from the API, building the request URL
// from the RepoListRoute definition. It returns a nil slice together with an
// error if URL generation or the request fails, the server answers with a
// non-200 status, or the body does not decode.
func (s *repoClient) List() ([]*Repo, error) {
	url, err := apiRouter.Get(RepoListRoute).URL() // HL
	if err != nil {
		return nil, err
	}
	resp, err := http.Get(s.baseURL + url.String())
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("list repos: unexpected HTTP status %s", resp.Status)
	}

	// Decode in its own statement: inside a single return statement the
	// language does not specify whether repos is read before or after Decode
	// has filled it in.
	var repos []*Repo
	if err := json.NewDecoder(resp.Body).Decode(&repos); err != nil {
		return nil, err
	}
	return repos, nil
}

// END CLIENT LIST OMIT

// START CLIENT SEARCH OMIT

// Search encodes opt as a query string, requests the RepoSearchRoute URL and
// decodes the JSON list of matches. Every failure (URL generation, encoding,
// transport, non-200 status, decoding) is returned with a nil slice; the
// result is decoded before the return statement so it never depends on
// operand evaluation order.
func (s *repoClient) Search(opt *SearchOptions) ([]*Repo, error) {
	url, err := apiRouter.Get(RepoSearchRoute).URL()
	if err != nil { // OMIT
		return nil, err // OMIT
	} // OMIT
	q, err := query.Values(opt) // encode querystring with github.com/google/go-querystring/query // HL
	if err != nil { // OMIT
		return nil, err // OMIT
	} // OMIT
	resp, err := http.Get(s.baseURL + url.String() + "?" + q.Encode())
	// ...
	if err != nil { // OMIT
		return nil, err // OMIT
	} // OMIT
	defer resp.Body.Close() // OMIT
	// OMIT
	if resp.StatusCode != http.StatusOK { // OMIT
		return nil, fmt.Errorf("search repos: unexpected HTTP status %s", resp.Status) // OMIT
	} // OMIT
	// OMIT
	var repos []*Repo // OMIT
	if err := json.NewDecoder(resp.Body).Decode(&repos); err != nil { // OMIT
		return nil, err // OMIT
	} // OMIT
	return repos, nil // OMIT
}

// END CLIENT SEARCH OMIT

////////////////////////////////////////////////////////////////////////////////////////////////////////////////

// START STORE OMIT
// repoStore is the data store; it wraps a database handle.
type repoStore struct{ db *db }

// Get selects the repository called name from the repo table.
// It returns a nil *Repo whenever the query reports an error.
func (s *repoStore) Get(name string) (*Repo, error) {
	var repo *Repo
	// Run the query in its own statement: inside a single return statement the
	// language does not specify whether repo is read before or after Select
	// has filled it in.
	if err := s.db.Select(&repo, "SELECT * FROM repo WHERE name=$1", name); err != nil {
		return nil, err
	}
	return repo, nil
}

// END STORE OMIT

// List is a stub: it reports no repositories and no error.
func (s *repoStore) List() ([]*Repo, error) {
	return nil, nil
}

// Search is a stub: it logs the options it was given and always answers with
// the same two hard-coded repositories.
func (s *repoStore) Search(opt *SearchOptions) ([]*Repo, error) {
	log.Printf("repo search options: %+v", opt)
	results := []*Repo{
		{Name: "myrepo", CloneURL: "git://github.com/foo/myrepo.git"},
		{Name: "mux", CloneURL: "git://github.com/gorilla/mux.git"},
	}
	return results, nil
}

// db is a stand-in for a real database handle.
type db struct{}

// Select fakes a query. When v is a **Repo it stores a Repo fabricated from
// the first query argument (taken as the repository name); for any other v it
// does nothing. The sql text is only used in the error message. It returns an
// error, instead of panicking on args[0], when a **Repo is requested without
// any argument.
func (*db) Select(v interface{}, sql string, args ...interface{}) error {
	repo, ok := v.(**Repo)
	if !ok {
		return nil
	}
	if len(args) == 0 {
		return fmt.Errorf("select %q: missing name argument", sql)
	}
	// A non-string argument yields the empty name, as before.
	name, _ := args[0].(string)
	*repo = &Repo{filepath.Base(name), "git://" + name + ".git"}
	return nil
}

////////////////////////////////////////////////////////////////////////////////////////////////////////////////

// main serves the handlers registered on the default mux on port 7777.
// ListenAndServe only returns on failure, so its error is fatal: the process
// exits with a non-zero status instead of logging and exiting 0.
func main() {
	bind := ":7777"
	log.Printf("Listening on %s", bind)
	log.Fatal(http.ListenAndServe(bind, nil))
}
// Search (excerpt): generates the URL for RepoSearchRoute from the shared
// router, encodes opt as a query string and issues the GET request. The rest
// of the body is elided here.
func (s *repoClient) Search(opt *SearchOptions) ([]*Repo, error) {
    url, _ := apiRouter.Get(RepoSearchRoute).URL()
    q, _ := query.Values(opt) // encode querystring with github.com/google/go-querystring/query
    resp, err := http.Get(s.baseURL + url.String() + "?" + q.Encode())
    // ...
}
18

Other things we did

19

Part 2: a standalone CLI tool that builds and analyzes projects' code

20

Why code analysis?

Just indexing the raw text of source files isn't good enough:

21

What is code analysis?

Sourcegraph analyzes code:

Requires:

22

Analysis output

package mypkg

import "fmt"

// Greet prints a friendly greeting.
func Greet(salutation string) {
    fmt.Println(salutation, "to", Name)
}

// Name of whom you want to greet.
var Name = "Milton"
~/src/github.com/sqs/mypkg $ srcgraph make
{"Definitions": [
   {"Name": "Greet", "Kind": "func",
    "File": "myfile.go", "DefStart": 30, "DefEnd": 68,
    "Exported": true, "Type": "func(salutation string)"},
   ...
 ],
 "References": [
   {"DefRepo": "code.google.com/p/go", "DefUnit": "fmt", "DefPath": "Println",
    "File": "myfile.go", "Start": 49, "End": 56}, ...
 ]
}
23

Go helps make our CLI tool:

24

Easy to distribute

25

Easy to compile

go install github.com/sourcegraph/srcgraph
26

Easy to compose

srcgraph make produces a standard Makefile describing the analysis
process.

The Makefile's recipes in turn call other srcgraph subcommands:

~/src/github.com/golang/groupcache$ srcgraph make -print-makefile

# Resolve dependencies of groupcache
.sourcegraph-data/groupcache@GoPackage_deps.json: *.go
    srcgraph resolve-deps -json $^ > $@

# Dump definitions and references in groupcache
.sourcegraph-data/groupcache@GoPackage_graph.json: *.go
    srcgraph graph -json $^ > $@

# Attribute each definition in groupcache to its authors
.sourcegraph-data/groupcache@GoPackage_authorship.json: .sourcegraph-data/groupcache@GoPackage_graph.json
    srcgraph authorship -json $^ > $@
27

So far, srcgraph make has:

28

Language-specific analysis

// Analyzer is the per-language analysis hook; implementations are registered
// under a language name.
type Analyzer interface {
    // Analyze a package to find all definitions (funcs, types, vars, etc.) and references
    Analyze(pkg string) ([]*Def, []*Ref, error)
}

// DependencyLister is the per-language hook for listing what a package
// depends on.
type DependencyLister interface {
    // ListDependencies of pkg, which is either a file (e.g., package.json, setup.py)
    // or a dir (e.g., a Go package directory).
    ListDependencies(pkg string) ([]*Dep, error)
}
29

Example: simple JavaScript analyzer

We want to analyze:

var filename = "foo";

var address = "bar";

function openFile() {}

function closeFile() {}

function startServer() {}

First implement lang.Analyzer:

// Analyzer is the per-language analysis hook; implementations are registered
// under a language name.
type Analyzer interface {
    // Analyze a package to find all definitions (funcs, types, vars, etc.) and references
    Analyze(pkg string) ([]*Def, []*Ref, error)
}
30

Example: simple JavaScript analyzer

// JSAnalyzer is a deliberately simple, regexp-based analyzer for JavaScript
// source files.
type JSAnalyzer struct{}

// jsdef matches a "var" or "function" keyword followed by a space and an
// identifier; group 1 is the keyword and group 2 the name.
var jsdef = regexp.MustCompile(`(var|function) (\w+)`)

// Analyze reads file and returns one Def per jsdef match, using the matched
// identifier as Name and the keyword as Type. It never reports references.
// A read failure is returned to the caller rather than terminating the
// process from inside the analyzer.
func (JSAnalyzer) Analyze(file string) ([]*lang.Def, []*lang.Ref, error) {
    src, err := ioutil.ReadFile(file)
    if err != nil {
        return nil, nil, err
    }
    var defs []*lang.Def
    for _, m := range jsdef.FindAllStringSubmatch(string(src), -1) {
        defs = append(defs, &lang.Def{Name: m[2], Type: m[1]})
    }
    return defs, nil, nil
}
package main

import (
	"flag"
	"fmt"
	"log"

	"github.com/sourcegraph/talks/google-io-2014/javascript"
)

// main analyzes one JavaScript file (the single command-line argument, or the
// bundled sample when exactly one argument is not given) and prints a
// two-column table with the name and type of every definition found.
func main() {
	flag.Parse()
	log.SetFlags(0)

	// Start from the bundled sample and override it only for a lone argument.
	file := "google-io-2014/javascript/sample.js"
	if flag.NArg() == 1 {
		file = flag.Arg(0)
	}
	log.Println("Analyzing", file, "for funcs & vars")

	analyzer := &javascript.JSAnalyzer{}
	defs, _, err := analyzer.Analyze(file)
	if err != nil {
		log.Fatal(err)
	}

	fmt.Println("NAME      \tTYPE")
	fmt.Println("----      \t----")
	for i := range defs {
		fmt.Printf("%-10s\t%s\n", defs[i].Name, defs[i].Type)
	}
}

(Real analyzers perform type checking and inference and are written in their
target language.)

31

Now what?

We've defined the analyzer, but how does the program know to call it to analyze .js files?

Ideas:

32

"Where does the stdlib do something similar?"

"Don't ad-lib; just crib from the standard lib"

33

And in open source Go projects...

github.com/dotcloud/docker/engine/engine.go:

// Handler is the signature of a function that is given a Job and returns a Status.
type Handler func(*Job) Status

// globalHandlers maps a command name to the Handler registered for it.
var globalHandlers = make(map[string]Handler)

// Register records handler under name. If a handler is already registered for
// name it returns an error and leaves the existing entry untouched.
func Register(name string, handler Handler) error {
  _, exists := globalHandlers[name]
  if exists {
    return fmt.Errorf("Can't overwrite global handler for command %s", name)
  }
  globalHandlers[name] = handler
  return nil
}

github.com/benmanns/goworker/workers.go
github.com/russross/meddler/meddler.go
github.com/rcrowley/go-metrics/registry.go
List of Go funcs named Register

34

Solution: mimic sql.Register/http.Handle/crypto.RegisterHash

A global variable + a Register function.

package lang

// analyzers maps a language name to the Analyzer registered for it.
var analyzers = make(map[string]Analyzer)

// Register makes the analyzer a available under the given language name.
// Like database/sql.Register, it panics if a is nil or if Register is called
// twice for the same language: both are programming errors that would
// otherwise silently drop an analyzer or store an unusable one.
func Register(language string, a Analyzer) {
    if a == nil {
        panic("lang: Register analyzer is nil")
    }
    if _, dup := analyzers[language]; dup {
        panic("lang: Register called twice for language " + language)
    }
    analyzers[language] = a
}

In the language-specific packages, register the handler:

package javascript

// init registers a JSAnalyzer for the "js" language when this package is
// imported.
func init() {
    lang.Register("js", new(JSAnalyzer))
}

Then import language-specific packages for side effects in your program (to call init).

package main
import _ "github.com/sourcegraph/talks/google-io-2014/javascript"

import (
	"github.com/sourcegraph/talks/google-io-2014/lang"
)

// main calls lang.PrintHandlers.
// NOTE(review): presumably this prints the analyzers that the blank-imported
// language packages registered from their init functions — confirm in
// package lang.
func main() {
	lang.PrintHandlers()
}
35

Taking simplicity for granted

Remember that Register function that used a global var?

In other languages, that simple solution would be frowned upon.

36

Go's design makes this simple solution OK

In Go:

37

Analysis complete!

We upload the analysis output (language-independent JSON representing code's definitions) to our API.

And the web app will display it.

38

Sourcegraph: A large-scale code search engine in Go

Feedback and follow-ups?

Interested in using Sourcegraph to host docs & examples for your project?

Or joining us to build Sourcegraph?

Or t-shirts?

39

Thank you

Quinn Slack

Sourcegraph

sourcegraph.com

Use the left and right arrow keys or click the left and right edges of the page to navigate between slides.
(Press 'H' or navigate to hide this message.)