syzkaller/syz-manager/manager.go

1312 lines
36 KiB
Go

// Copyright 2015 syzkaller project authors. All rights reserved.
// Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file.
package main
import (
"bytes"
"encoding/json"
"flag"
"fmt"
"math/rand"
"net"
"os"
"os/exec"
"path/filepath"
"sync"
"sync/atomic"
"time"
"github.com/google/syzkaller/dashboard/dashapi"
"github.com/google/syzkaller/pkg/cover"
"github.com/google/syzkaller/pkg/csource"
"github.com/google/syzkaller/pkg/db"
"github.com/google/syzkaller/pkg/gce"
"github.com/google/syzkaller/pkg/hash"
. "github.com/google/syzkaller/pkg/log"
"github.com/google/syzkaller/pkg/osutil"
"github.com/google/syzkaller/pkg/report"
"github.com/google/syzkaller/pkg/repro"
. "github.com/google/syzkaller/pkg/rpctype"
"github.com/google/syzkaller/pkg/signal"
"github.com/google/syzkaller/prog"
"github.com/google/syzkaller/sys"
"github.com/google/syzkaller/syz-manager/mgrconfig"
"github.com/google/syzkaller/vm"
)
var (
flagConfig = flag.String("config", "", "configuration file")
flagDebug = flag.Bool("debug", false, "dump all VM output to console")
flagBench = flag.String("bench", "", "write execution statistics into this file periodically")
)
type Manager struct {
cfg *mgrconfig.Config
vmPool *vm.Pool
target *prog.Target
reporter report.Reporter
crashdir string
port int
corpusDB *db.DB
startTime time.Time
firstConnect time.Time
lastPrioCalc time.Time
fuzzingTime time.Duration
stats map[string]uint64
crashTypes map[string]bool
vmStop chan bool
vmChecked bool
fresh bool
numFuzzing uint32
numReproducing uint32
dash *dashapi.Dashboard
mu sync.Mutex
phase int
enabledSyscalls []int
enabledCalls []string // as determined by fuzzer
candidates []RPCCandidate // untriaged inputs from corpus and hub
disabledHashes map[string]struct{}
corpus map[string]RPCInput
corpusCover cover.Cover
corpusSignal signal.Signal
maxSignal signal.Signal
prios [][]float32
newRepros [][]byte
fuzzers map[string]*Fuzzer
hub *RPCClient
hubCorpus map[hash.Sig]bool
needMoreRepros chan chan bool
hubReproQueue chan *Crash
reproRequest chan chan map[string]bool
// For checking that files that we are using are not changing under us.
// Maps file name to modification time.
usedFiles map[string]time.Time
}
const (
// Just started, nothing done yet.
phaseInit = iota
// Triaged all inputs from corpus.
// This is when we start querying hub and minimizing persistent corpus.
phaseTriagedCorpus
// Done the first request to hub.
phaseQueriedHub
// Triaged all new inputs from hub.
// This is when we start reproducing crashes.
phaseTriagedHub
)
const currentDBVersion = 3
type Fuzzer struct {
name string
inputs []RPCInput
newMaxSignal signal.Signal
}
type Crash struct {
vmIndex int
hub bool // this crash was created based on a repro from hub
*report.Report
}
func main() {
if sys.GitRevision == "" {
Fatalf("Bad syz-manager build. Build with make, run bin/syz-manager.")
}
flag.Parse()
EnableLogCaching(1000, 1<<20)
cfg, err := mgrconfig.LoadFile(*flagConfig)
if err != nil {
Fatalf("%v", err)
}
target, err := prog.GetTarget(cfg.TargetOS, cfg.TargetArch)
if err != nil {
Fatalf("%v", err)
}
syscalls, err := mgrconfig.ParseEnabledSyscalls(target, cfg.Enable_Syscalls, cfg.Disable_Syscalls)
if err != nil {
Fatalf("%v", err)
}
initAllCover(cfg.TargetOS, cfg.TargetVMArch, cfg.Vmlinux)
RunManager(cfg, target, syscalls)
}
func RunManager(cfg *mgrconfig.Config, target *prog.Target, syscalls map[int]bool) {
var vmPool *vm.Pool
// Type "none" is a special case for debugging/development when manager
// does not start any VMs, but instead you start them manually
// and start syz-fuzzer there.
if cfg.Type != "none" {
env := mgrconfig.CreateVMEnv(cfg, *flagDebug)
var err error
vmPool, err = vm.Create(cfg.Type, env)
if err != nil {
Fatalf("%v", err)
}
}
crashdir := filepath.Join(cfg.Workdir, "crashes")
osutil.MkdirAll(crashdir)
var enabledSyscalls []int
for c := range syscalls {
enabledSyscalls = append(enabledSyscalls, c)
}
mgr := &Manager{
cfg: cfg,
vmPool: vmPool,
target: target,
crashdir: crashdir,
startTime: time.Now(),
stats: make(map[string]uint64),
crashTypes: make(map[string]bool),
enabledSyscalls: enabledSyscalls,
corpus: make(map[string]RPCInput),
disabledHashes: make(map[string]struct{}),
fuzzers: make(map[string]*Fuzzer),
fresh: true,
vmStop: make(chan bool),
hubReproQueue: make(chan *Crash, 10),
needMoreRepros: make(chan chan bool),
reproRequest: make(chan chan map[string]bool),
usedFiles: make(map[string]time.Time),
}
Logf(0, "loading corpus...")
var err error
mgr.corpusDB, err = db.Open(filepath.Join(cfg.Workdir, "corpus.db"))
if err != nil {
Fatalf("failed to open corpus database: %v", err)
}
// By default we don't re-minimize/re-smash programs from corpus,
// it takes lots of time on start and is unnecessary.
// However, on version bumps we can selectively re-minimize/re-smash.
minimized, smashed := true, true
switch mgr.corpusDB.Version {
case 0:
// Version 0 had broken minimization, so we need to re-minimize.
minimized = false
fallthrough
case 1:
// Version 1->2: memory is preallocated so lots of mmaps become unnecessary.
minimized = false
fallthrough
case 2:
// Version 2->3: big-endian hints.
smashed = false
fallthrough
case currentDBVersion:
}
deleted := 0
for key, rec := range mgr.corpusDB.Records {
p, err := mgr.target.Deserialize(rec.Val)
if err != nil {
if deleted < 10 {
Logf(0, "deleting broken program: %v\n%s", err, rec.Val)
}
mgr.corpusDB.Delete(key)
deleted++
continue
}
disabled := false
for _, c := range p.Calls {
if !syscalls[c.Meta.ID] {
disabled = true
break
}
}
if disabled {
// This program contains a disabled syscall.
// We won't execute it, but remember its hash so
// it is not deleted during minimization.
// TODO: use mgr.enabledCalls which accounts for missing devices, etc.
// But it is available only after vm check.
mgr.disabledHashes[hash.String(rec.Val)] = struct{}{}
continue
}
mgr.candidates = append(mgr.candidates, RPCCandidate{
Prog: rec.Val,
Minimized: minimized,
Smashed: smashed,
})
}
mgr.fresh = len(mgr.corpusDB.Records) == 0
Logf(0, "loaded %v programs (%v total, %v deleted)",
len(mgr.candidates), len(mgr.corpusDB.Records), deleted)
// Now this is ugly.
// We duplicate all inputs in the corpus and shuffle the second part.
// This solves the following problem. A fuzzer can crash while triaging candidates,
// in such case it will also lost all cached candidates. Or, the input can be somewhat flaky
// and doesn't give the coverage on first try. So we give each input the second chance.
// Shuffling should alleviate deterministically losing the same inputs on fuzzer crashing.
mgr.candidates = append(mgr.candidates, mgr.candidates...)
shuffle := mgr.candidates[len(mgr.candidates)/2:]
for i := range shuffle {
j := i + rand.Intn(len(shuffle)-i)
shuffle[i], shuffle[j] = shuffle[j], shuffle[i]
}
// Create HTTP server.
mgr.initHTTP()
mgr.collectUsedFiles()
// Create RPC server for fuzzers.
s, err := NewRPCServer(cfg.RPC, mgr)
if err != nil {
Fatalf("failed to create rpc server: %v", err)
}
Logf(0, "serving rpc on tcp://%v", s.Addr())
mgr.port = s.Addr().(*net.TCPAddr).Port
go s.Serve()
if cfg.Dashboard_Addr != "" {
mgr.dash = dashapi.New(cfg.Dashboard_Client, cfg.Dashboard_Addr, cfg.Dashboard_Key)
}
go func() {
for lastTime := time.Now(); ; {
time.Sleep(10 * time.Second)
now := time.Now()
diff := now.Sub(lastTime)
lastTime = now
mgr.mu.Lock()
if mgr.firstConnect.IsZero() {
mgr.mu.Unlock()
continue
}
mgr.fuzzingTime += diff * time.Duration(atomic.LoadUint32(&mgr.numFuzzing))
executed := mgr.stats["exec total"]
crashes := mgr.stats["crashes"]
signal := mgr.corpusSignal.Len()
mgr.mu.Unlock()
numReproducing := atomic.LoadUint32(&mgr.numReproducing)
numFuzzing := atomic.LoadUint32(&mgr.numFuzzing)
Logf(0, "VMs %v, executed %v, cover %v, crashes %v, repro %v",
numFuzzing, executed, signal, crashes, numReproducing)
}
}()
if *flagBench != "" {
f, err := os.OpenFile(*flagBench, os.O_WRONLY|os.O_CREATE|os.O_EXCL, osutil.DefaultFilePerm)
if err != nil {
Fatalf("failed to open bench file: %v", err)
}
go func() {
for {
time.Sleep(time.Minute)
vals := make(map[string]uint64)
mgr.mu.Lock()
if mgr.firstConnect.IsZero() {
mgr.mu.Unlock()
continue
}
mgr.minimizeCorpus()
vals["corpus"] = uint64(len(mgr.corpus))
vals["uptime"] = uint64(time.Since(mgr.firstConnect)) / 1e9
vals["fuzzing"] = uint64(mgr.fuzzingTime) / 1e9
vals["signal"] = uint64(mgr.corpusSignal.Len())
vals["coverage"] = uint64(len(mgr.corpusCover))
for k, v := range mgr.stats {
vals[k] = v
}
mgr.mu.Unlock()
data, err := json.MarshalIndent(vals, "", " ")
if err != nil {
Fatalf("failed to serialize bench data")
}
if _, err := f.Write(append(data, '\n')); err != nil {
Fatalf("failed to write bench data")
}
}
}()
}
if mgr.dash != nil {
go mgr.dashboardReporter()
}
if mgr.cfg.Hub_Client != "" {
go func() {
for {
time.Sleep(time.Minute)
mgr.hubSync()
}
}()
}
osutil.HandleInterrupts(vm.Shutdown)
if mgr.vmPool == nil {
Logf(0, "no VMs started (type=none)")
Logf(0, "you are supposed to start syz-fuzzer manually as:")
Logf(0, "syz-fuzzer -manager=manager.ip:%v [other flags as necessary]", mgr.port)
<-vm.Shutdown
return
}
mgr.vmLoop()
}
type RunResult struct {
idx int
crash *Crash
err error
}
type ReproResult struct {
instances []int
title0 string
res *repro.Result
err error
hub bool // repro came from hub
}
func (mgr *Manager) vmLoop() {
Logf(0, "booting test machines...")
Logf(0, "wait for the connection from test machine...")
instancesPerRepro := 4
vmCount := mgr.vmPool.Count()
if instancesPerRepro > vmCount {
instancesPerRepro = vmCount
}
instances := make([]int, vmCount)
for i := range instances {
instances[i] = vmCount - i - 1
}
runDone := make(chan *RunResult, 1)
pendingRepro := make(map[*Crash]bool)
reproducing := make(map[string]bool)
reproInstances := 0
var reproQueue []*Crash
reproDone := make(chan *ReproResult, 1)
stopPending := false
shutdown := vm.Shutdown
for {
mgr.mu.Lock()
phase := mgr.phase
mgr.mu.Unlock()
for crash := range pendingRepro {
if reproducing[crash.Title] {
continue
}
delete(pendingRepro, crash)
if !crash.hub {
if mgr.dash == nil {
if !mgr.needRepro(crash) {
continue
}
} else {
cid := &dashapi.CrashID{
BuildID: mgr.cfg.Tag,
Title: crash.Title,
Corrupted: crash.Corrupted,
}
needRepro, err := mgr.dash.NeedRepro(cid)
if err != nil {
Logf(0, "dashboard.NeedRepro failed: %v", err)
}
if !needRepro {
continue
}
}
}
Logf(1, "loop: add to repro queue '%v'", crash.Title)
reproducing[crash.Title] = true
reproQueue = append(reproQueue, crash)
}
Logf(1, "loop: phase=%v shutdown=%v instances=%v/%v %+v repro: pending=%v reproducing=%v queued=%v",
phase, shutdown == nil, len(instances), vmCount, instances,
len(pendingRepro), len(reproducing), len(reproQueue))
canRepro := func() bool {
return phase >= phaseTriagedHub &&
len(reproQueue) != 0 && reproInstances+instancesPerRepro <= vmCount
}
if shutdown == nil {
if len(instances) == vmCount {
return
}
} else {
for canRepro() && len(instances) >= instancesPerRepro {
last := len(reproQueue) - 1
crash := reproQueue[last]
reproQueue[last] = nil
reproQueue = reproQueue[:last]
vmIndexes := append([]int{}, instances[len(instances)-instancesPerRepro:]...)
instances = instances[:len(instances)-instancesPerRepro]
reproInstances += instancesPerRepro
atomic.AddUint32(&mgr.numReproducing, 1)
Logf(1, "loop: starting repro of '%v' on instances %+v", crash.Title, vmIndexes)
go func() {
res, err := repro.Run(crash.Output, mgr.cfg, mgr.getReporter(), mgr.vmPool, vmIndexes)
reproDone <- &ReproResult{vmIndexes, crash.Title, res, err, crash.hub}
}()
}
for !canRepro() && len(instances) != 0 {
last := len(instances) - 1
idx := instances[last]
instances = instances[:last]
Logf(1, "loop: starting instance %v", idx)
go func() {
crash, err := mgr.runInstance(idx)
runDone <- &RunResult{idx, crash, err}
}()
}
}
var stopRequest chan bool
if !stopPending && canRepro() {
stopRequest = mgr.vmStop
}
select {
case stopRequest <- true:
Logf(1, "loop: issued stop request")
stopPending = true
case res := <-runDone:
Logf(1, "loop: instance %v finished, crash=%v", res.idx, res.crash != nil)
if res.err != nil && shutdown != nil {
Logf(0, "%v", res.err)
}
stopPending = false
instances = append(instances, res.idx)
// On shutdown qemu crashes with "qemu: terminating on signal 2",
// which we detect as "lost connection". Don't save that as crash.
if shutdown != nil && res.crash != nil && !mgr.isSuppressed(res.crash) {
needRepro := mgr.saveCrash(res.crash)
if needRepro {
Logf(1, "loop: add pending repro for '%v'", res.crash.Title)
pendingRepro[res.crash] = true
}
}
case res := <-reproDone:
atomic.AddUint32(&mgr.numReproducing, ^uint32(0))
crepro := false
title := ""
if res.res != nil {
crepro = res.res.CRepro
title = res.res.Report.Title
}
Logf(1, "loop: repro on %+v finished '%v', repro=%v crepro=%v desc='%v'",
res.instances, res.title0, res.res != nil, crepro, title)
if res.err != nil {
Logf(0, "repro failed: %v", res.err)
}
delete(reproducing, res.title0)
instances = append(instances, res.instances...)
reproInstances -= instancesPerRepro
if res.res == nil {
if !res.hub {
mgr.saveFailedRepro(res.title0)
}
} else {
mgr.saveRepro(res.res, res.hub)
}
case <-shutdown:
Logf(1, "loop: shutting down...")
shutdown = nil
case crash := <-mgr.hubReproQueue:
Logf(1, "loop: get repro from hub")
pendingRepro[crash] = true
case reply := <-mgr.needMoreRepros:
reply <- phase >= phaseTriagedHub &&
len(reproQueue)+len(pendingRepro)+len(reproducing) == 0
case reply := <-mgr.reproRequest:
repros := make(map[string]bool)
for title := range reproducing {
repros[title] = true
}
reply <- repros
}
}
}
func (mgr *Manager) runInstance(index int) (*Crash, error) {
mgr.checkUsedFiles()
inst, err := mgr.vmPool.Create(index)
if err != nil {
return nil, fmt.Errorf("failed to create instance: %v", err)
}
defer inst.Close()
fwdAddr, err := inst.Forward(mgr.port)
if err != nil {
return nil, fmt.Errorf("failed to setup port forwarding: %v", err)
}
fuzzerBin, err := inst.Copy(mgr.cfg.SyzFuzzerBin)
if err != nil {
return nil, fmt.Errorf("failed to copy binary: %v", err)
}
executorBin, err := inst.Copy(mgr.cfg.SyzExecutorBin)
if err != nil {
return nil, fmt.Errorf("failed to copy binary: %v", err)
}
// Leak detection significantly slows down fuzzing, so detect leaks only on the first instance.
leak := mgr.cfg.Leak && index == 0
fuzzerV := 0
procs := mgr.cfg.Procs
if *flagDebug {
fuzzerV = 100
procs = 1
}
// Run the fuzzer binary.
start := time.Now()
atomic.AddUint32(&mgr.numFuzzing, 1)
defer atomic.AddUint32(&mgr.numFuzzing, ^uint32(0))
cmd := fmt.Sprintf("%v -executor=%v -name=vm-%v -arch=%v -manager=%v -procs=%v"+
" -leak=%v -cover=%v -sandbox=%v -debug=%v -v=%d",
fuzzerBin, executorBin, index, mgr.cfg.TargetArch, fwdAddr, procs,
leak, mgr.cfg.Cover, mgr.cfg.Sandbox, *flagDebug, fuzzerV)
outc, errc, err := inst.Run(time.Hour, mgr.vmStop, cmd)
if err != nil {
return nil, fmt.Errorf("failed to run fuzzer: %v", err)
}
rep := vm.MonitorExecution(outc, errc, mgr.getReporter(), false)
if rep == nil {
// This is the only "OK" outcome.
Logf(0, "vm-%v: running for %v, restarting", index, time.Since(start))
return nil, nil
}
cash := &Crash{
vmIndex: index,
hub: false,
Report: rep,
}
return cash, nil
}
func (mgr *Manager) isSuppressed(crash *Crash) bool {
for _, re := range mgr.cfg.ParsedSuppressions {
if !re.Match(crash.Output) {
continue
}
Logf(1, "vm-%v: suppressing '%v' with '%v'", crash.vmIndex, crash.Title, re.String())
mgr.mu.Lock()
mgr.stats["suppressed"]++
mgr.mu.Unlock()
return true
}
return false
}
func (mgr *Manager) emailCrash(crash *Crash) {
if len(mgr.cfg.Email_Addrs) == 0 {
return
}
args := []string{"-s", "syzkaller: " + crash.Title}
args = append(args, mgr.cfg.Email_Addrs...)
Logf(0, "sending email to %v", mgr.cfg.Email_Addrs)
cmd := exec.Command("mailx", args...)
cmd.Stdin = bytes.NewReader(crash.Report.Report)
if _, err := osutil.Run(10*time.Minute, cmd); err != nil {
Logf(0, "failed to send email: %v", err)
}
}
func (mgr *Manager) saveCrash(crash *Crash) bool {
corrupted := ""
if crash.Corrupted {
corrupted = " [corrupted]"
}
Logf(0, "vm-%v: crash: %v%v", crash.vmIndex, crash.Title, corrupted)
if err := mgr.getReporter().Symbolize(crash.Report); err != nil {
Logf(0, "failed to symbolize report: %v", err)
}
mgr.mu.Lock()
mgr.stats["crashes"]++
if !mgr.crashTypes[crash.Title] {
mgr.crashTypes[crash.Title] = true
mgr.stats["crash types"]++
}
mgr.mu.Unlock()
if mgr.dash != nil {
dc := &dashapi.Crash{
BuildID: mgr.cfg.Tag,
Title: crash.Title,
Corrupted: crash.Corrupted,
Maintainers: crash.Maintainers,
Log: crash.Output,
Report: crash.Report.Report,
}
resp, err := mgr.dash.ReportCrash(dc)
if err != nil {
Logf(0, "failed to report crash to dashboard: %v", err)
} else {
// Don't store the crash locally, if we've successfully
// uploaded it to the dashboard. These will just eat disk space.
return resp.NeedRepro
}
}
sig := hash.Hash([]byte(crash.Title))
id := sig.String()
dir := filepath.Join(mgr.crashdir, id)
osutil.MkdirAll(dir)
if err := osutil.WriteFile(filepath.Join(dir, "description"), []byte(crash.Title+"\n")); err != nil {
Logf(0, "failed to write crash: %v", err)
}
// Save up to 100 reports. If we already have 100, overwrite the oldest one.
// Newer reports are generally more useful. Overwriting is also needed
// to be able to understand if a particular bug still happens or already fixed.
oldestI := 0
var oldestTime time.Time
for i := 0; i < 100; i++ {
info, err := os.Stat(filepath.Join(dir, fmt.Sprintf("log%v", i)))
if err != nil {
oldestI = i
if i == 0 {
go mgr.emailCrash(crash)
}
break
}
if oldestTime.IsZero() || info.ModTime().Before(oldestTime) {
oldestI = i
oldestTime = info.ModTime()
}
}
osutil.WriteFile(filepath.Join(dir, fmt.Sprintf("log%v", oldestI)), crash.Output)
if len(mgr.cfg.Tag) > 0 {
osutil.WriteFile(filepath.Join(dir, fmt.Sprintf("tag%v", oldestI)), []byte(mgr.cfg.Tag))
}
if len(crash.Report.Report) > 0 {
osutil.WriteFile(filepath.Join(dir, fmt.Sprintf("report%v", oldestI)), crash.Report.Report)
}
return mgr.needRepro(crash)
}
const maxReproAttempts = 3
func (mgr *Manager) needRepro(crash *Crash) bool {
if !mgr.cfg.Reproduce || crash.Corrupted {
return false
}
sig := hash.Hash([]byte(crash.Title))
dir := filepath.Join(mgr.crashdir, sig.String())
if osutil.IsExist(filepath.Join(dir, "repro.prog")) {
return false
}
for i := 0; i < maxReproAttempts; i++ {
if !osutil.IsExist(filepath.Join(dir, fmt.Sprintf("repro%v", i))) {
return true
}
}
return false
}
func (mgr *Manager) saveFailedRepro(desc string) {
if mgr.dash != nil {
cid := &dashapi.CrashID{
BuildID: mgr.cfg.Tag,
Title: desc,
}
if err := mgr.dash.ReportFailedRepro(cid); err != nil {
Logf(0, "failed to report failed repro to dashboard: %v", err)
}
}
dir := filepath.Join(mgr.crashdir, hash.String([]byte(desc)))
osutil.MkdirAll(dir)
for i := 0; i < maxReproAttempts; i++ {
name := filepath.Join(dir, fmt.Sprintf("repro%v", i))
if !osutil.IsExist(name) {
osutil.WriteFile(name, nil)
break
}
}
}
func (mgr *Manager) saveRepro(res *repro.Result, hub bool) {
rep := res.Report
if err := mgr.getReporter().Symbolize(rep); err != nil {
Logf(0, "failed to symbolize repro: %v", err)
}
dir := filepath.Join(mgr.crashdir, hash.String([]byte(rep.Title)))
osutil.MkdirAll(dir)
if err := osutil.WriteFile(filepath.Join(dir, "description"), []byte(rep.Title+"\n")); err != nil {
Logf(0, "failed to write crash: %v", err)
}
opts := fmt.Sprintf("# %+v\n", res.Opts)
prog := res.Prog.Serialize()
osutil.WriteFile(filepath.Join(dir, "repro.prog"), append([]byte(opts), prog...))
if len(mgr.cfg.Tag) > 0 {
osutil.WriteFile(filepath.Join(dir, "repro.tag"), []byte(mgr.cfg.Tag))
}
if len(rep.Output) > 0 {
osutil.WriteFile(filepath.Join(dir, "repro.log"), rep.Output)
}
if len(rep.Report) > 0 {
osutil.WriteFile(filepath.Join(dir, "repro.report"), rep.Report)
}
osutil.WriteFile(filepath.Join(dir, "repro.stats.log"), res.Stats.Log)
stats := fmt.Sprintf("Extracting prog: %s\nMinimizing prog: %s\nSimplifying prog options: %s\nExtracting C: %s\nSimplifying C: %s\n",
res.Stats.ExtractProgTime, res.Stats.MinimizeProgTime, res.Stats.SimplifyProgTime, res.Stats.ExtractCTime, res.Stats.SimplifyCTime)
osutil.WriteFile(filepath.Join(dir, "repro.stats"), []byte(stats))
var cprogText []byte
if res.CRepro {
cprog, err := csource.Write(res.Prog, res.Opts)
if err == nil {
formatted, err := csource.Format(cprog)
if err == nil {
cprog = formatted
}
osutil.WriteFile(filepath.Join(dir, "repro.cprog"), cprog)
cprogText = cprog
} else {
Logf(0, "failed to write C source: %v", err)
}
}
// Append this repro to repro list to send to hub if it didn't come from hub originally.
if !hub {
progForHub := []byte(fmt.Sprintf("# %+v\n# %v\n# %v\n%s",
res.Opts, res.Report.Title, mgr.cfg.Tag, prog))
mgr.mu.Lock()
mgr.newRepros = append(mgr.newRepros, progForHub)
mgr.mu.Unlock()
}
if mgr.dash != nil {
// Note: we intentionally don't set Corrupted for reproducers:
// 1. This is reproducible so can be debugged even with corrupted report.
// 2. Repro re-tried 3 times and still got corrupted report at the end,
// so maybe corrupted report detection is broken.
// 3. Reproduction is expensive so it's good to persist the result.
dc := &dashapi.Crash{
BuildID: mgr.cfg.Tag,
Title: res.Report.Title,
Maintainers: res.Report.Maintainers,
Log: res.Report.Output,
Report: res.Report.Report,
ReproOpts: res.Opts.Serialize(),
ReproSyz: res.Prog.Serialize(),
ReproC: cprogText,
}
if _, err := mgr.dash.ReportCrash(dc); err != nil {
Logf(0, "failed to report repro to dashboard: %v", err)
}
}
}
func (mgr *Manager) getReporter() report.Reporter {
if mgr.reporter == nil {
<-allSymbolsReady
var err error
// TODO(dvyukov): we should introduce cfg.Kernel_Obj dir instead of Vmlinux.
// This will be more general taking into account modules and other OSes.
kernelSrc, kernelObj := "", ""
if mgr.cfg.Vmlinux != "" {
kernelSrc = mgr.cfg.Kernel_Src
kernelObj = filepath.Dir(mgr.cfg.Vmlinux)
}
mgr.reporter, err = report.NewReporter(mgr.cfg.TargetOS, kernelSrc, kernelObj,
allSymbols, mgr.cfg.ParsedIgnores)
if err != nil {
Fatalf("%v", err)
}
}
return mgr.reporter
}
func (mgr *Manager) minimizeCorpus() {
if mgr.cfg.Cover && len(mgr.corpus) != 0 {
inputs := make([]signal.Context, 0, len(mgr.corpus))
for _, inp := range mgr.corpus {
inputs = append(inputs, signal.Context{
Signal: inp.Signal.Deserialize(),
Context: inp,
})
}
newCorpus := make(map[string]RPCInput)
for _, ctx := range signal.Minimize(inputs) {
inp := ctx.(RPCInput)
newCorpus[hash.String(inp.Prog)] = inp
}
Logf(1, "minimized corpus: %v -> %v", len(mgr.corpus), len(newCorpus))
mgr.corpus = newCorpus
}
// Don't minimize persistent corpus until fuzzers have triaged all inputs from it.
if mgr.phase >= phaseTriagedCorpus {
for key := range mgr.corpusDB.Records {
_, ok1 := mgr.corpus[key]
_, ok2 := mgr.disabledHashes[key]
if !ok1 && !ok2 {
mgr.corpusDB.Delete(key)
}
}
mgr.corpusDB.BumpVersion(currentDBVersion)
}
}
func (mgr *Manager) Connect(a *ConnectArgs, r *ConnectRes) error {
Logf(1, "fuzzer %v connected", a.Name)
mgr.mu.Lock()
defer mgr.mu.Unlock()
if mgr.firstConnect.IsZero() {
mgr.firstConnect = time.Now()
Logf(0, "received first connection from test machine %v", a.Name)
}
mgr.stats["vm restarts"]++
f := &Fuzzer{
name: a.Name,
}
mgr.fuzzers[a.Name] = f
mgr.minimizeCorpus()
if mgr.prios == nil || time.Since(mgr.lastPrioCalc) > 30*time.Minute {
// Deserializing all programs is slow, so we do it episodically and without holding the mutex.
mgr.lastPrioCalc = time.Now()
inputs := make([][]byte, 0, len(mgr.corpus))
for _, inp := range mgr.corpus {
inputs = append(inputs, inp.Prog)
}
mgr.mu.Unlock()
corpus := make([]*prog.Prog, 0, len(inputs))
for _, inp := range inputs {
p, err := mgr.target.Deserialize(inp)
if err != nil {
panic(err)
}
corpus = append(corpus, p)
}
prios := mgr.target.CalculatePriorities(corpus)
mgr.mu.Lock()
mgr.prios = prios
}
for _, inp := range mgr.corpus {
r.Inputs = append(r.Inputs, inp)
}
r.Prios = mgr.prios
r.EnabledCalls = mgr.enabledSyscalls
r.NeedCheck = !mgr.vmChecked
r.MaxSignal = mgr.maxSignal.Serialize()
for i := 0; i < mgr.cfg.Procs && len(mgr.candidates) > 0; i++ {
last := len(mgr.candidates) - 1
r.Candidates = append(r.Candidates, mgr.candidates[last])
mgr.candidates = mgr.candidates[:last]
}
if len(mgr.candidates) == 0 {
mgr.candidates = nil
}
return nil
}
func (mgr *Manager) Check(a *CheckArgs, r *int) error {
mgr.mu.Lock()
defer mgr.mu.Unlock()
if mgr.vmChecked {
return nil
}
Logf(0, "machine check: %v calls enabled, kcov=%v, kleakcheck=%v, faultinjection=%v, comps=%v",
len(a.Calls), a.Kcov, a.Leak, a.Fault, a.CompsSupported)
if mgr.cfg.Cover && !a.Kcov {
Fatalf("/sys/kernel/debug/kcov is missing on target machine. Enable CONFIG_KCOV and mount debugfs")
}
if mgr.cfg.Sandbox == "namespace" && !a.UserNamespaces {
Fatalf("/proc/self/ns/user is missing on target machine or permission is denied. Can't use requested namespace sandbox. Enable CONFIG_USER_NS")
}
if mgr.vmPool != nil {
if mgr.target.Arch != a.ExecutorArch {
Fatalf("mismatching target/executor arch: target=%v executor=%v",
mgr.target.Arch, a.ExecutorArch)
}
if sys.GitRevision != a.FuzzerGitRev || sys.GitRevision != a.ExecutorGitRev {
Fatalf("syz-manager, syz-fuzzer and syz-executor binaries are built on different git revisions\n"+
"manager= %v\nfuzzer= %v\nexecutor=%v\n"+
"this is not supported, rebuild all binaries with make",
sys.GitRevision, a.FuzzerGitRev, a.ExecutorGitRev)
}
if mgr.target.Revision != a.FuzzerSyzRev || mgr.target.Revision != a.ExecutorSyzRev {
Fatalf("syz-manager, syz-fuzzer and syz-executor binaries have different versions of system call descriptions compiled in\n"+
"manager= %v\nfuzzer= %v\nexecutor=%v\n"+
"this is not supported, rebuild all binaries with make",
mgr.target.Revision, a.FuzzerSyzRev, a.ExecutorSyzRev)
}
}
if len(mgr.cfg.Enable_Syscalls) != 0 && len(a.DisabledCalls) != 0 {
disabled := make(map[string]string)
for _, dc := range a.DisabledCalls {
disabled[dc.Name] = dc.Reason
}
for _, id := range mgr.enabledSyscalls {
name := mgr.target.Syscalls[id].Name
if reason := disabled[name]; reason != "" {
Logf(0, "disabling %v: %v", name, reason)
}
}
}
if len(a.Calls) == 0 {
Fatalf("all system calls are disabled")
}
mgr.vmChecked = true
mgr.enabledCalls = a.Calls
return nil
}
func (mgr *Manager) NewInput(a *NewInputArgs, r *int) error {
inputSignal := a.Signal.Deserialize()
Logf(4, "new input from %v for syscall %v (signal=%v, cover=%v)",
a.Name, a.Call, inputSignal.Len(), len(a.Cover))
mgr.mu.Lock()
defer mgr.mu.Unlock()
f := mgr.fuzzers[a.Name]
if f == nil {
Fatalf("fuzzer %v is not connected", a.Name)
}
if _, err := mgr.target.Deserialize(a.RPCInput.Prog); err != nil {
// This should not happen, but we see such cases episodically, reason unknown.
Logf(0, "failed to deserialize program from fuzzer: %v\n%s", err, a.RPCInput.Prog)
return nil
}
if mgr.corpusSignal.Diff(inputSignal).Empty() {
return nil
}
mgr.stats["manager new inputs"]++
mgr.corpusSignal.Merge(inputSignal)
mgr.corpusCover.Merge(a.Cover)
sig := hash.String(a.RPCInput.Prog)
if inp, ok := mgr.corpus[sig]; ok {
// The input is already present, but possibly with diffent signal/coverage/call.
inputSignal.Merge(inp.Signal.Deserialize())
inp.Signal = inputSignal.Serialize()
var inputCover cover.Cover
inputCover.Merge(inp.Cover)
inputCover.Merge(a.RPCInput.Cover)
inp.Cover = inputCover.Serialize()
mgr.corpus[sig] = inp
} else {
mgr.corpus[sig] = a.RPCInput
mgr.corpusDB.Save(sig, a.RPCInput.Prog, 0)
if err := mgr.corpusDB.Flush(); err != nil {
Logf(0, "failed to save corpus database: %v", err)
}
for _, f1 := range mgr.fuzzers {
if f1 == f {
continue
}
inp := a.RPCInput
inp.Cover = nil // Don't send coverage back to all fuzzers.
f1.inputs = append(f1.inputs, inp)
}
}
return nil
}
func (mgr *Manager) Poll(a *PollArgs, r *PollRes) error {
mgr.mu.Lock()
defer mgr.mu.Unlock()
for k, v := range a.Stats {
mgr.stats[k] += v
}
f := mgr.fuzzers[a.Name]
if f == nil {
Fatalf("fuzzer %v is not connected", a.Name)
}
newMaxSignal := mgr.maxSignal.Diff(a.MaxSignal.Deserialize())
if !newMaxSignal.Empty() {
mgr.maxSignal.Merge(newMaxSignal)
for _, f1 := range mgr.fuzzers {
if f1 == f {
continue
}
f1.newMaxSignal.Merge(newMaxSignal)
}
}
if !f.newMaxSignal.Empty() {
r.MaxSignal = f.newMaxSignal.Serialize()
f.newMaxSignal = nil
}
for i := 0; i < 100 && len(f.inputs) > 0; i++ {
last := len(f.inputs) - 1
r.NewInputs = append(r.NewInputs, f.inputs[last])
f.inputs = f.inputs[:last]
}
if len(f.inputs) == 0 {
f.inputs = nil
}
if a.NeedCandidates {
for i := 0; i < mgr.cfg.Procs && len(mgr.candidates) > 0; i++ {
last := len(mgr.candidates) - 1
r.Candidates = append(r.Candidates, mgr.candidates[last])
mgr.candidates = mgr.candidates[:last]
}
}
if len(mgr.candidates) == 0 {
mgr.candidates = nil
if mgr.phase == phaseInit {
if mgr.cfg.Hub_Client != "" {
mgr.phase = phaseTriagedCorpus
} else {
mgr.phase = phaseTriagedHub
}
}
}
Logf(4, "poll from %v: candidates=%v inputs=%v", a.Name, len(r.Candidates), len(r.NewInputs))
return nil
}
func (mgr *Manager) hubSync() {
mgr.mu.Lock()
defer mgr.mu.Unlock()
switch mgr.phase {
case phaseInit:
return
case phaseTriagedCorpus:
mgr.phase = phaseQueriedHub
case phaseQueriedHub:
if len(mgr.candidates) == 0 {
mgr.phase = phaseTriagedHub
}
case phaseTriagedHub:
default:
panic("unknown phase")
}
mgr.minimizeCorpus()
if mgr.hub == nil {
a := &HubConnectArgs{
Client: mgr.cfg.Hub_Client,
Key: mgr.cfg.Hub_Key,
Manager: mgr.cfg.Name,
Fresh: mgr.fresh,
Calls: mgr.enabledCalls,
}
hubCorpus := make(map[hash.Sig]bool)
for _, inp := range mgr.corpus {
hubCorpus[hash.Hash(inp.Prog)] = true
a.Corpus = append(a.Corpus, inp.Prog)
}
mgr.mu.Unlock()
// Hub.Connect request can be very large, so do it on a transient connection
// (rpc connection buffers never shrink).
// Also don't do hub rpc's under the mutex -- hub can be slow or inaccessible.
if err := RPCCall(mgr.cfg.Hub_Addr, "Hub.Connect", a, nil); err != nil {
mgr.mu.Lock()
Logf(0, "Hub.Connect rpc failed: %v", err)
return
}
conn, err := NewRPCClient(mgr.cfg.Hub_Addr)
if err != nil {
mgr.mu.Lock()
Logf(0, "failed to connect to hub at %v: %v", mgr.cfg.Hub_Addr, err)
return
}
mgr.mu.Lock()
mgr.hub = conn
mgr.hubCorpus = hubCorpus
mgr.fresh = false
Logf(0, "connected to hub at %v, corpus %v", mgr.cfg.Hub_Addr, len(mgr.corpus))
}
a := &HubSyncArgs{
Client: mgr.cfg.Hub_Client,
Key: mgr.cfg.Hub_Key,
Manager: mgr.cfg.Name,
}
corpus := make(map[hash.Sig]bool)
for _, inp := range mgr.corpus {
sig := hash.Hash(inp.Prog)
corpus[sig] = true
if mgr.hubCorpus[sig] {
continue
}
mgr.hubCorpus[sig] = true
a.Add = append(a.Add, inp.Prog)
}
for sig := range mgr.hubCorpus {
if corpus[sig] {
continue
}
delete(mgr.hubCorpus, sig)
a.Del = append(a.Del, sig.String())
}
for {
a.Repros = mgr.newRepros
mgr.mu.Unlock()
if mgr.cfg.Reproduce && mgr.dash != nil {
needReproReply := make(chan bool)
mgr.needMoreRepros <- needReproReply
a.NeedRepros = <-needReproReply
}
r := new(HubSyncRes)
if err := mgr.hub.Call("Hub.Sync", a, r); err != nil {
mgr.mu.Lock()
Logf(0, "Hub.Sync rpc failed: %v", err)
mgr.hub.Close()
mgr.hub = nil
return
}
reproDropped := 0
for _, repro := range r.Repros {
_, err := mgr.target.Deserialize(repro)
if err != nil {
reproDropped++
continue
}
mgr.hubReproQueue <- &Crash{
vmIndex: -1,
hub: true,
Report: &report.Report{
Title: "external repro",
Output: repro,
},
}
}
mgr.mu.Lock()
mgr.newRepros = nil
dropped := 0
for _, inp := range r.Progs {
_, err := mgr.target.Deserialize(inp)
if err != nil {
dropped++
continue
}
mgr.candidates = append(mgr.candidates, RPCCandidate{
Prog: inp,
Minimized: false, // don't trust programs from hub
Smashed: false,
})
}
mgr.stats["hub add"] += uint64(len(a.Add))
mgr.stats["hub del"] += uint64(len(a.Del))
mgr.stats["hub drop"] += uint64(dropped)
mgr.stats["hub new"] += uint64(len(r.Progs) - dropped)
mgr.stats["hub sent repros"] += uint64(len(a.Repros))
mgr.stats["hub recv repros"] += uint64(len(r.Repros) - reproDropped)
Logf(0, "hub sync: send: add %v, del %v, repros %v; recv: progs: drop %v, new %v, repros: drop: %v, new %v; more %v",
len(a.Add), len(a.Del), len(a.Repros), dropped, len(r.Progs)-dropped, reproDropped, len(r.Repros)-reproDropped, r.More)
if len(r.Progs)+r.More == 0 {
break
}
a.Add = nil
a.Del = nil
}
}
func (mgr *Manager) collectUsedFiles() {
if mgr.vmPool == nil {
return
}
addUsedFile := func(f string) {
if f == "" {
return
}
stat, err := os.Stat(f)
if err != nil {
Fatalf("failed to stat %v: %v", f, err)
}
mgr.usedFiles[f] = stat.ModTime()
}
cfg := mgr.cfg
addUsedFile(cfg.SyzFuzzerBin)
addUsedFile(cfg.SyzExecprogBin)
addUsedFile(cfg.SyzExecutorBin)
addUsedFile(cfg.SSHKey)
addUsedFile(cfg.Vmlinux)
if cfg.Image != "9p" {
addUsedFile(cfg.Image)
}
}
func (mgr *Manager) checkUsedFiles() {
for f, mod := range mgr.usedFiles {
stat, err := os.Stat(f)
if err != nil {
Fatalf("failed to stat %v: %v", f, err)
}
if mod != stat.ModTime() {
Fatalf("file %v that syz-manager uses has been modified by an external program\n"+
"this can lead to arbitrary syz-manager misbehavior\n"+
"modification time has changed: %v -> %v\n"+
"don't modify files that syz-manager uses. exiting to prevent harm",
f, mod, stat.ModTime())
}
}
}
func (mgr *Manager) dashboardReporter() {
webAddr := publicWebAddr(mgr.cfg.HTTP)
var lastFuzzingTime time.Duration
var lastCrashes, lastExecs uint64
for {
time.Sleep(time.Minute)
mgr.mu.Lock()
if mgr.firstConnect.IsZero() {
mgr.mu.Unlock()
continue
}
crashes := mgr.stats["crashes"]
execs := mgr.stats["exec total"]
req := &dashapi.ManagerStatsReq{
Name: mgr.cfg.Name,
Addr: webAddr,
UpTime: time.Since(mgr.firstConnect),
Corpus: uint64(len(mgr.corpus)),
Cover: uint64(mgr.corpusSignal.Len()),
FuzzingTime: mgr.fuzzingTime - lastFuzzingTime,
Crashes: crashes - lastCrashes,
Execs: execs - lastExecs,
}
mgr.mu.Unlock()
if err := mgr.dash.UploadManagerStats(req); err != nil {
Logf(0, "faield to upload dashboard stats: %v", err)
continue
}
mgr.mu.Lock()
lastFuzzingTime += req.FuzzingTime
lastCrashes += req.Crashes
lastExecs += req.Execs
mgr.mu.Unlock()
}
}
func publicWebAddr(addr string) string {
_, port, err := net.SplitHostPort(addr)
if err == nil && port != "" {
if host, err := os.Hostname(); err == nil {
addr = net.JoinHostPort(host, port)
}
if GCE, err := gce.NewContext(); err == nil {
addr = net.JoinHostPort(GCE.ExternalIP, port)
}
}
return "http://" + addr
}