internal/gcimporter: use two-level file index

This change introduces a two-level index of files, as a
precursor to an optimization in which only the line number
information for the necessary positions is recorded.
The optimization naturally requires two passes over the
data, which means we can't emit the file information
in one gulp.

Change-Id: Ia8e015c8b19cbf6074661ec345c7360a325d1054
Reviewed-on: https://go-review.googlesource.com/c/tools/+/462095
Reviewed-by: Robert Findley <rfindley@google.com>
Run-TryBot: Alan Donovan <adonovan@google.com>
gopls-CI: kokoro <noreply+kokoro@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Alan Donovan 2023-01-13 11:44:12 -05:00
Parent 8aba49bb5e
Commit d958e85480
2 changed files with 70 additions and 37 deletions
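A minimal, self-contained sketch of the two-level scheme this change adopts for file records: records are appended to one flat buffer, while a separate table maps each small file index to the byte offset of its record, so a reader can locate and decode file i on demand. Every name below is invented for illustration and does not appear in gcimporter.

package main

import "fmt"

// twoLevel pairs a flat data buffer with a table of byte offsets,
// one entry per record index.
type twoLevel struct {
    offsets []int  // offsets[i] is where record i starts in data
    data    []byte // concatenated, variable-length records
}

// add appends a record and returns its small integer index.
func (t *twoLevel) add(record []byte) int {
    index := len(t.offsets)
    t.offsets = append(t.offsets, len(t.data))
    t.data = append(t.data, record...)
    return index
}

// lookup finds record i without scanning the records before it.
func (t *twoLevel) lookup(index int) []byte {
    start := t.offsets[index]
    end := len(t.data)
    if index+1 < len(t.offsets) {
        end = t.offsets[index+1]
    }
    return t.data[start:end]
}

func main() {
    var t twoLevel
    a := t.add([]byte("record for file A"))
    b := t.add([]byte("record for file B"))
    fmt.Println(a, string(t.lookup(a))) // 0 record for file A
    fmt.Println(b, string(t.lookup(b))) // 1 record for file B
}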

View file

@@ -102,7 +102,6 @@ func iexportCommon(out io.Writer, fset *token.FileSet, bundle, shallow bool, ver
shallow: shallow,
allPkgs: map[*types.Package]bool{},
stringIndex: map[string]uint64{},
fileIndex: map[*token.File]uint64{},
declIndex: map[types.Object]uint64{},
tparamNames: map[types.Object]string{},
typIndex: map[types.Type]uint64{},
@@ -141,6 +140,34 @@ func iexportCommon(out io.Writer, fset *token.FileSet, bundle, shallow bool, ver
p.doDecl(p.declTodo.popHead())
}
// Produce index of offset of each file record in files.
var files intWriter
var fileOffset []uint64 // fileOffset[i] is offset in files of file encoded as i
if p.shallow {
fileOffset = make([]uint64, len(p.fileInfos))
for i, info := range p.fileInfos {
fileOffset[i] = uint64(files.Len())
files.uint64(p.stringOff(info.file.Name()))
files.uint64(uint64(info.file.Size()))
// Delta-encode the line offsets, omitting the initial zero.
// (An empty file has an empty lines array.)
//
// TODO(adonovan): opt: use a two-pass approach that
// first gathers the set of Pos values and then
// encodes only the information necessary for them.
// This would allow us to discard the lines after the
// last object of interest and to run-length encode the
// trivial lines between lines with needed positions.
lines := getLines(info.file)
files.uint64(uint64(len(lines)))
for i := 1; i < len(lines); i++ {
files.uint64(uint64(lines[i] - lines[i-1]))
}
}
}
// Append indices to data0 section.
dataLen := uint64(p.data0.Len())
w := p.newWriter()
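The loop above delta-encodes the line-start offsets returned by getLines, dropping the leading zero. A hedged round-trip sketch of that encoding; encodeLineDeltas and decodeLineDeltas are hypothetical helpers, not part of the package.

// encodeLineDeltas mirrors the writer loop above: the first line always
// starts at offset 0, so only the gaps between successive line starts
// are written.
func encodeLineDeltas(lines []int) []uint64 {
    deltas := make([]uint64, 0, len(lines))
    for i := 1; i < len(lines); i++ {
        deltas = append(deltas, uint64(lines[i]-lines[i-1]))
    }
    return deltas
}

// decodeLineDeltas reverses the encoding by cumulative sum, restoring the
// implicit leading zero; an empty file round-trips to an empty table.
func decodeLineDeltas(nlines int, deltas []uint64) []int {
    lines := make([]int, 0, nlines)
    if nlines > 0 {
        lines = append(lines, 0)
        for _, d := range deltas {
            lines = append(lines, lines[len(lines)-1]+int(d))
        }
    }
    return lines
}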
@@ -167,7 +194,11 @@ func iexportCommon(out io.Writer, fset *token.FileSet, bundle, shallow bool, ver
hdr.uint64(uint64(p.version))
hdr.uint64(uint64(p.strings.Len()))
if p.shallow {
hdr.uint64(uint64(p.files.Len()))
hdr.uint64(uint64(files.Len()))
hdr.uint64(uint64(len(fileOffset)))
for _, offset := range fileOffset {
hdr.uint64(offset)
}
}
hdr.uint64(dataLen)
@@ -175,7 +206,7 @@ func iexportCommon(out io.Writer, fset *token.FileSet, bundle, shallow bool, ver
io.Copy(out, &hdr)
io.Copy(out, &p.strings)
if p.shallow {
io.Copy(out, &p.files)
io.Copy(out, &files)
}
io.Copy(out, &p.data0)
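Taken together with the previous hunk, the shallow header now carries: version, string-section length, files-section length, file count, one offset per file, then the declaration-data length, followed by the strings, files, and data0 sections. A sketch of consuming those fields in the order they are written; uint64Reader stands in for the package's intReader and is an assumption of the sketch.

// uint64Reader is a stand-in for the package's intReader.
type uint64Reader interface{ uint64() uint64 }

// readShallowHeader reads the header fields in the order the exporter
// writes them (see the hunks above).
func readShallowHeader(r uint64Reader, shallow bool) (sLen, fLen, dLen int64, fileOffset []uint64) {
    _ = r.uint64()           // version
    sLen = int64(r.uint64()) // length of the string section
    if shallow {
        fLen = int64(r.uint64())                // length of the files section
        fileOffset = make([]uint64, r.uint64()) // one byte offset per file index
        for i := range fileOffset {
            fileOffset[i] = r.uint64()
        }
    }
    dLen = int64(r.uint64()) // length of the declaration data
    return
}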
@@ -266,8 +297,9 @@ type iexporter struct {
// In shallow mode, object positions are encoded as (file, offset).
// Each file is recorded as a line-number table.
files intWriter
fileIndex map[*token.File]uint64
// Only the lines of needed positions are saved faithfully.
fileInfo map[*token.File]uint64 // value is index in fileInfos
fileInfos []*filePositions
data0 intWriter
declIndex map[types.Object]uint64
@@ -277,6 +309,11 @@ type iexporter struct {
indent int // for tracing support
}
type filePositions struct {
file *token.File
needed []token.Pos // unordered list of needed positions
}
func (p *iexporter) trace(format string, args ...interface{}) {
if !trace {
// Call sites should also be guarded, but having this check here allows
@@ -300,33 +337,21 @@ func (p *iexporter) stringOff(s string) uint64 {
return off
}
// fileOff returns the offset of the token.File encoding.
// If not already present, it's added to the end.
func (p *iexporter) fileOff(file *token.File) uint64 {
off, ok := p.fileIndex[file]
// fileIndex returns the index of the token.File.
func (p *iexporter) fileIndex(file *token.File, pos token.Pos) uint64 {
index, ok := p.fileInfo[file]
if !ok {
off = uint64(p.files.Len())
p.fileIndex[file] = off
p.files.uint64(p.stringOff(file.Name()))
p.files.uint64(uint64(file.Size()))
// Delta-encode the line offsets, omitting the initial zero.
// (An empty file has an empty lines array.)
//
// TODO(adonovan): opt: use a two-pass approach that
// first gathers the set of Pos values and then
// encodes only the information necessary for them.
// This would allow us to discard the lines after the
// last object of interest and to run-length encode the
// trivial lines between lines with needed positions.
lines := getLines(file)
p.files.uint64(uint64(len(lines)))
for i := 1; i < len(lines); i++ {
p.files.uint64(uint64(lines[i] - lines[i-1]))
index = uint64(len(p.fileInfo))
p.fileInfos = append(p.fileInfos, &filePositions{file: file})
if p.fileInfo == nil {
p.fileInfo = make(map[*token.File]uint64)
}
p.fileInfo[file] = index
}
return off
// Record each needed position.
info := p.fileInfos[index]
info.needed = append(info.needed, pos)
return index
}
// pushDecl adds n to the declaration work queue, if not already present.
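fileIndex now also records, per file, every position that is actually referenced (filePositions.needed), although this change still writes the complete line table. A rough sketch of the second pass that the TODO and the commit message describe, collapsing the recorded positions into the set of lines worth keeping; the helper is hypothetical and the eventual encoding may well differ.

import "go/token"

// neededLines reduces the positions recorded for one file to the set of
// line numbers that must survive, so trailing lines can be dropped and
// unreferenced runs can later be run-length encoded. Hypothetical; not
// part of this change.
func neededLines(file *token.File, needed []token.Pos) map[int]bool {
    keep := make(map[int]bool)
    for _, pos := range needed {
        keep[file.Line(pos)] = true
    }
    return keep
}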
@@ -526,7 +551,7 @@ func (w *exportWriter) posV2(pos token.Pos) {
return
}
file := w.p.fset.File(pos) // fset must be non-nil
w.uint64(1 + w.p.fileOff(file))
w.uint64(1 + w.p.fileIndex(file, pos))
w.uint64(uint64(file.Offset(pos)))
}
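A shallow position is therefore written as the pair (file index + 1, byte offset within the file); the +1 presumably leaves zero free to mean "no position" (the early return above). A hedged decoding sketch built on the importer's fileAt; the helper, its interface, and the zero convention are assumptions of the sketch.

import "go/token"

// fileLookup is a stand-in for the importer's fileAt method.
type fileLookup interface{ fileAt(index uint64) *token.File }

// decodePos reverses the pair written by posV2 above.
func decodePos(p fileLookup, fileIndexPlus1, offset uint64) token.Pos {
    if fileIndexPlus1 == 0 {
        return token.NoPos // no recorded position
    }
    file := p.fileAt(fileIndexPlus1 - 1) // look up the file by its index
    return file.Pos(int(offset))         // byte offset within that file
}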

View file

@@ -138,9 +138,14 @@ func iimportCommon(fset *token.FileSet, imports map[string]*types.Package, data
sLen := int64(r.uint64())
var fLen int64
var fileOffset []uint64
if insert != nil {
// shallow mode uses a different position encoding
// Shallow mode uses a different position encoding.
fLen = int64(r.uint64())
fileOffset = make([]uint64, r.uint64())
for i := range fileOffset {
fileOffset[i] = r.uint64()
}
}
dLen := int64(r.uint64())
@@ -157,8 +162,9 @@ func iimportCommon(fset *token.FileSet, imports map[string]*types.Package, data
stringData: stringData,
stringCache: make(map[uint64]string),
fileOffset: fileOffset,
fileData: fileData,
fileCache: make(map[uint64]*token.File),
fileCache: make([]*token.File, len(fileOffset)),
pkgCache: make(map[uint64]*types.Package),
declData: declData,
@@ -288,8 +294,9 @@ type iimporter struct {
stringData []byte
stringCache map[uint64]string
fileOffset []uint64 // fileOffset[i] is offset in fileData for info about file encoded as i
fileData []byte
fileCache map[uint64]*token.File
fileCache []*token.File // memoized decoding of file encoded as i
pkgCache map[uint64]*types.Package
declData []byte
@@ -362,9 +369,10 @@ func (p *iimporter) stringAt(off uint64) string {
return s
}
func (p *iimporter) fileAt(off uint64) *token.File {
file, ok := p.fileCache[off]
if !ok {
func (p *iimporter) fileAt(index uint64) *token.File {
file := p.fileCache[index]
if file == nil {
off := p.fileOffset[index]
rd := intReader{bytes.NewReader(p.fileData[off:]), p.ipath}
filename := p.stringAt(rd.uint64())
size := int(rd.uint64())
@@ -380,7 +388,7 @@ func (p *iimporter) fileAt(off uint64) *token.File {
}
}
p.fileCache[off] = file
p.fileCache[index] = file
}
return file
}
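For completeness, a sketch of how one file record might be turned back into a *token.File on the importer side: register the file with the FileSet, then reinstall the line-start table obtained by summing the deltas. The helper, its signature, and its use of (*token.File).SetLines are assumptions; the elided body of fileAt plays this role in the real code.

import "go/token"

// decodeFile rebuilds a *token.File from the fields of one file record.
func decodeFile(fset *token.FileSet, filename string, size, nlines int, deltas []uint64) *token.File {
    file := fset.AddFile(filename, -1, size) // -1: next available base
    // Rebuild the line-start offsets by cumulative sum; the leading zero
    // was omitted by the writer, and an empty file has no lines at all.
    lines := make([]int, 0, nlines)
    if nlines > 0 {
        lines = append(lines, 0)
        for _, d := range deltas {
            lines = append(lines, lines[len(lines)-1]+int(d))
        }
    }
    if !file.SetLines(lines) {
        panic("invalid line offsets in export data")
    }
    return file
}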