diff --git a/changelogs/unreleased/284-remove-zip-buffer.yml b/changelogs/unreleased/284-remove-zip-buffer.yml new file mode 100644 index 0000000000000000000000000000000000000000..05f363fcca5676dd854507a8411f0162dcee1cbd --- /dev/null +++ b/changelogs/unreleased/284-remove-zip-buffer.yml @@ -0,0 +1,5 @@ +--- +title: Remove an in-memory buffer for LSIF transformation +merge_request: 586 +author: +type: performance diff --git a/internal/lsif_transformer/parser/docs.go b/internal/lsif_transformer/parser/docs.go index 0f587832deac998773174866071d956643878b59..3b293a1eecb5373603482f89b9e8710b5b4477cf 100644 --- a/internal/lsif_transformer/parser/docs.go +++ b/internal/lsif_transformer/parser/docs.go @@ -2,7 +2,9 @@ package parser import ( "archive/zip" + "bufio" "encoding/json" + "io" "strings" ) @@ -45,7 +47,19 @@ func NewDocs(config Config) (*Docs, error) { }, nil } -func (d *Docs) Read(line []byte) error { +func (d *Docs) Parse(r io.Reader) error { + scanner := bufio.NewScanner(r) + + for scanner.Scan() { + if err := d.process(scanner.Bytes()); err != nil { + return err + } + } + + return scanner.Err() +} + +func (d *Docs) process(line []byte) error { l := Line{} if err := json.Unmarshal(line, &l); err != nil { return err diff --git a/internal/lsif_transformer/parser/docs_test.go b/internal/lsif_transformer/parser/docs_test.go index ff19dd561bb24e25e9206bb3955a93f4cf3dcf47..1d0154c40ffba8967edaf83ae958e8fa29008b30 100644 --- a/internal/lsif_transformer/parser/docs_test.go +++ b/internal/lsif_transformer/parser/docs_test.go @@ -1,6 +1,7 @@ package parser import ( + "bytes" "fmt" "testing" @@ -8,34 +9,34 @@ import ( ) func createLine(id, label, uri string) []byte { - return []byte(fmt.Sprintf(`{"id":"%s","label":"%s","uri":"%s"}`, id, label, uri)) + return []byte(fmt.Sprintf(`{"id":"%s","label":"%s","uri":"%s"}`+"\n", id, label, uri)) } -func TestRead(t *testing.T) { +func TestParse(t *testing.T) { d, err := NewDocs(Config{}) require.NoError(t, err) defer d.Close() - metadataLine := []byte(`{"id":"1","label":"metaData","projectRoot":"file:///Users/nested"}`) + data := []byte(`{"id":"1","label":"metaData","projectRoot":"file:///Users/nested"}` + "\n") + data = append(data, createLine("2", "document", "file:///Users/nested/file.rb")...) + data = append(data, createLine("3", "document", "file:///Users/nested/folder/file.rb")...) + data = append(data, createLine("4", "document", "file:///Users/wrong/file.rb")...) - require.NoError(t, d.Read(metadataLine)) - require.NoError(t, d.Read(createLine("2", "document", "file:///Users/nested/file.rb"))) - require.NoError(t, d.Read(createLine("3", "document", "file:///Users/nested/folder/file.rb"))) - require.NoError(t, d.Read(createLine("4", "document", "file:///Users/wrong/file.rb"))) + require.NoError(t, d.Parse(bytes.NewReader(data))) require.Equal(t, d.Entries[2], "file.rb") require.Equal(t, d.Entries[3], "folder/file.rb") require.Equal(t, d.Entries[4], "file:///Users/wrong/file.rb") } -func TestReadContainsLine(t *testing.T) { +func TestParseContainsLine(t *testing.T) { d, err := NewDocs(Config{}) require.NoError(t, err) defer d.Close() - line := []byte(`{"id":"5","label":"contains","outV":"1", "inVs": ["2", "3"]}`) + line := []byte(`{"id":"5","label":"contains","outV":"1", "inVs": ["2", "3"]}` + "\n") - require.NoError(t, d.Read(line)) + require.NoError(t, d.Parse(bytes.NewReader(line))) require.Equal(t, []Id{2, 3}, d.DocRanges[1]) } diff --git a/internal/lsif_transformer/parser/parser.go b/internal/lsif_transformer/parser/parser.go index 8687e1243d78b6990548c656a0f2174cf3e77708..44c1e8fe835e1a23f577e466222d3dc7e2f5d6e4 100644 --- a/internal/lsif_transformer/parser/parser.go +++ b/internal/lsif_transformer/parser/parser.go @@ -2,9 +2,8 @@ package parser import ( "archive/zip" - "bufio" - "bytes" "errors" + "fmt" "io" "io/ioutil" "os" @@ -16,84 +15,91 @@ var ( type Parser struct { Docs *Docs + + pr *io.PipeReader } type Config struct { TempPath string } -func NewParser(r io.Reader, config Config) (*Parser, error) { +func NewParser(r io.Reader, config Config) (io.ReadCloser, error) { docs, err := NewDocs(config) if err != nil { return nil, err } - zr, err := openZipReader(r, config.TempPath) + // ZIP files need to be seekable. Don't hold it all in RAM, use a tempfile + tempFile, err := ioutil.TempFile(config.TempPath, Lsif) if err != nil { return nil, err } - reader := bufio.NewReader(zr) - - for { - line, err := reader.ReadBytes('\n') - if err != nil { - break - } - - if err := docs.Read(line); err != nil { - return nil, err - } - } - - return &Parser{Docs: docs}, nil -} -func (p *Parser) ZipReader() (io.Reader, error) { - buf := new(bytes.Buffer) - w := zip.NewWriter(buf) + defer tempFile.Close() - if err := p.Docs.SerializeEntries(w); err != nil { + if err := os.Remove(tempFile.Name()); err != nil { return nil, err } - if err := w.Close(); err != nil { + size, err := io.Copy(tempFile, r) + if err != nil { return nil, err } - return buf, nil -} - -func (p *Parser) Close() error { - return p.Docs.Close() -} - -func openZipReader(reader io.Reader, tempDir string) (io.Reader, error) { - tempFile, err := ioutil.TempFile(tempDir, Lsif) + zr, err := zip.NewReader(tempFile, size) if err != nil { return nil, err } - if err := os.Remove(tempFile.Name()); err != nil { - return nil, err + if len(zr.File) == 0 { + return nil, errors.New("empty zip file") } - size, err := io.Copy(tempFile, reader) + file, err := zr.File[0].Open() if err != nil { return nil, err } - if _, err := tempFile.Seek(0, io.SeekStart); err != nil { + defer file.Close() + + if err := docs.Parse(file); err != nil { return nil, err } - zr, err := zip.NewReader(tempFile, size) - if err != nil { - return nil, err + pr, pw := io.Pipe() + parser := &Parser{ + Docs: docs, + pr: pr, } - if len(zr.File) == 0 { - return nil, errors.New("empty zip file") + go parser.transform(pw) + + return parser, nil +} + +func (p *Parser) Read(b []byte) (int, error) { + return p.pr.Read(b) +} + +func (p *Parser) Close() error { + p.pr.Close() + + return p.Docs.Close() +} + +func (p *Parser) transform(pw *io.PipeWriter) { + zw := zip.NewWriter(pw) + + if err := p.Docs.SerializeEntries(zw); err != nil { + zw.Close() // Free underlying resources only + pw.CloseWithError(fmt.Errorf("lsif parser: Docs.SerializeEntries: %v", err)) + return + } + + if err := zw.Close(); err != nil { + pw.CloseWithError(fmt.Errorf("lsif parser: ZipWriter.Close: %v", err)) + return } - return zr.File[0].Open() + pw.Close() } diff --git a/internal/lsif_transformer/parser/parser_test.go b/internal/lsif_transformer/parser/parser_test.go index 5bc34d69f32ac8ca943569c95981793380118144..314df02b1d10df5d72f81b67fae4d5eda8e98e01 100644 --- a/internal/lsif_transformer/parser/parser_test.go +++ b/internal/lsif_transformer/parser/parser_test.go @@ -38,24 +38,21 @@ func verifyCorrectnessOf(t *testing.T, tmpDir, fileName string) { } func createFiles(t *testing.T, filePath, tmpDir string) { + t.Helper() file, err := os.Open(filePath) require.NoError(t, err) - p, err := NewParser(file, Config{}) + parser, err := NewParser(file, Config{}) require.NoError(t, err) - r, err := p.ZipReader() - require.NoError(t, err) - - require.NoError(t, p.Close()) - zipFileName := tmpDir + ".zip" w, err := os.Create(zipFileName) require.NoError(t, err) defer os.RemoveAll(zipFileName) - _, err = io.Copy(w, r) + _, err = io.Copy(w, parser) require.NoError(t, err) + require.NoError(t, parser.Close()) extractZipFiles(t, tmpDir, zipFileName) } diff --git a/internal/lsif_transformer/parser/performance_test.go b/internal/lsif_transformer/parser/performance_test.go index de79e621ba43347ebd6a0000949ba19404e9d07d..d3ecc1ed5c12da241b887a644ecf4f2b1775e740 100644 --- a/internal/lsif_transformer/parser/performance_test.go +++ b/internal/lsif_transformer/parser/performance_test.go @@ -1,6 +1,8 @@ package parser import ( + "io" + "io/ioutil" "os" "runtime" "testing" @@ -19,12 +21,12 @@ func BenchmarkGenerate(b *testing.B) { file, err := os.Open(filePath) require.NoError(b, err) - p, err := NewParser(file, Config{}) + parser, err := NewParser(file, Config{}) require.NoError(b, err) - _, err = p.ZipReader() + _, err = io.Copy(ioutil.Discard, parser) require.NoError(b, err) - require.NoError(b, p.Close()) + require.NoError(b, parser.Close()) }) } diff --git a/internal/upload/rewrite.go b/internal/upload/rewrite.go index 938ecef164f50a57d1dcb452c8b15f69aea90986..80928d64cffa3dc5284e00e8ff6ea4937bdcad62 100644 --- a/internal/upload/rewrite.go +++ b/internal/upload/rewrite.go @@ -5,6 +5,7 @@ import ( "errors" "fmt" "io" + "io/ioutil" "mime/multipart" "net/http" "strings" @@ -124,7 +125,7 @@ func (rew *rewriter) handleFilePart(ctx context.Context, name string, p *multipa opts.TempFilePrefix = filename - var inputReader io.Reader + var inputReader io.ReadCloser var err error switch { case exif.IsExifFile(filename): @@ -138,9 +139,11 @@ func (rew *rewriter) handleFilePart(ctx context.Context, name string, p *multipa return err } default: - inputReader = p + inputReader = ioutil.NopCloser(p) } + defer inputReader.Close() + fh, err := filestore.SaveFileFromReader(ctx, inputReader, -1, opts) if err != nil { switch err { @@ -166,36 +169,25 @@ func (rew *rewriter) handleFilePart(ctx context.Context, name string, p *multipa return rew.filter.ProcessFile(ctx, name, fh, rew.writer) } -func handleExifUpload(ctx context.Context, r io.Reader, filename string) (io.Reader, error) { +func handleExifUpload(ctx context.Context, r io.Reader, filename string) (io.ReadCloser, error) { log.WithContextFields(ctx, log.Fields{ "filename": filename, }).Print("running exiftool to remove any metadata") - return exif.NewCleaner(ctx, r) -} - -func handleLsifUpload(ctx context.Context, reader io.Reader, tempPath, filename string, preauth *api.Response) (io.Reader, error) { - parserConfig := parser.Config{ - TempPath: tempPath, - } - - p, err := parser.NewParser(reader, parserConfig) + r, err := exif.NewCleaner(ctx, r) if err != nil { return nil, err } - z, err := p.ZipReader() - if err != nil { - return nil, err - } + return ioutil.NopCloser(r), nil +} - if err := p.Close(); err != nil { - log.WithContextFields(ctx, log.Fields{ - "filename": filename, - }).Print("failed to close lsif parser: " + err.Error()) +func handleLsifUpload(ctx context.Context, reader io.Reader, tempPath, filename string, preauth *api.Response) (io.ReadCloser, error) { + parserConfig := parser.Config{ + TempPath: tempPath, } - return z, nil + return parser.NewParser(reader, parserConfig) } func (rew *rewriter) copyPart(ctx context.Context, name string, p *multipart.Part) error {