package pkg import ( "bufio" "bytes" "context" "crypto/sha512" "encoding/binary" "errors" "fmt" "io" "slices" "strconv" "syscall" "unique" "unsafe" ) // wordSize is the boundary which binary segments are always aligned to. const wordSize = 8 // alignSize returns the padded size for aligning sz. func alignSize(sz int) int { return sz + (wordSize-(sz)%wordSize)%wordSize } // panicToError recovers from a panic and replaces a nil error with the panicked // error value. If the value does not implement error, it is re-panicked. func panicToError(errP *error) { r := recover() if r == nil { return } if err, ok := r.(error); !ok { panic(r) } else if *errP == nil { *errP = err } } // IContext is passed to [Artifact.Params] and provides methods for writing // values to the IR writer. It does not expose the underlying [io.Writer]. // // IContext is valid until [Artifact.Params] returns. type IContext struct { // Address of underlying [Cache], should be zeroed or made unusable after // [Artifact.Params] returns and must not be exposed directly. cache *Cache // Written to by various methods, should be zeroed after [Artifact.Params] // returns and must not be exposed directly. w io.Writer } // Unwrap returns the underlying [context.Context]. func (i *IContext) Unwrap() context.Context { return i.cache.ctx } // irZero is a zero IR word. var irZero [wordSize]byte // IRValueKind denotes the kind of encoded value. type IRValueKind uint32 const ( // IRKindEnd denotes the end of the current parameters stream. The ancillary // value is interpreted as [IREndFlag]. IRKindEnd IRValueKind = iota // IRKindIdent denotes the identifier of a dependency [Artifact]. The // ancillary value is reserved for future use. IRKindIdent // IRKindUint32 denotes an inlined uint32 value. IRKindUint32 // IRKindString denotes a string with its true length encoded in header // ancillary data. Its wire length is always aligned to 8 byte boundary. IRKindString irHeaderShift = 32 irHeaderMask = 0xffffffff ) // String returns a user-facing name of k. func (k IRValueKind) String() string { switch k { case IRKindEnd: return "terminator" case IRKindIdent: return "ident" case IRKindUint32: return "uint32" case IRKindString: return "string" default: return "invalid kind " + strconv.Itoa(int(k)) } } // irValueHeader encodes [IRValueKind] and a 32-bit ancillary value. type irValueHeader uint64 // encodeHeader returns irValueHeader encoding [IRValueKind] and ancillary data. func (k IRValueKind) encodeHeader(v uint32) irValueHeader { return irValueHeader(v)< irMaxStringLength || sz > irMaxStringLength { panic(IRStringError(p)) } i.mustWrite(IRKindString.encodeHeader(uint32(len(p))).append(nil)) i.mustWrite(p) psz := sz - len(p) if psz > 0 { i.mustWrite(irZero[:psz]) } } // WriteString writes s as a string value to the IR. func (i *IContext) WriteString(s string) { p := unsafe.Slice(unsafe.StringData(s), len(s)) i.Write(p) } // Encode writes a deterministic, efficient representation of a to w and returns // the first non-nil error encountered while writing to w. func (c *Cache) Encode(w io.Writer, a Artifact) (err error) { deps := a.Dependencies() idents := make([]*extIdent, len(deps)) for i, d := range deps { dbuf, did := c.unsafeIdent(d, true) if dbuf == nil { dbuf = c.getIdentBuf() binary.LittleEndian.PutUint64(dbuf[:], uint64(d.Kind())) *(*ID)(dbuf[wordSize:]) = did.Value() } else { c.storeIdent(d, dbuf) } defer c.putIdentBuf(dbuf) idents[i] = dbuf } slices.SortFunc(idents, func(a, b *extIdent) int { return bytes.Compare(a[:], b[:]) }) idents = slices.CompactFunc(idents, func(a, b *extIdent) bool { return *a == *b }) // kind uint64 | deps_sz uint64 var buf [wordSize * 2]byte binary.LittleEndian.PutUint64(buf[:], uint64(a.Kind())) binary.LittleEndian.PutUint64(buf[wordSize:], uint64(len(idents))) if _, err = w.Write(buf[:]); err != nil { return } for _, dn := range idents { // kind uint64 | ident ID if _, err = w.Write(dn[:]); err != nil { return } } func() { i := IContext{c, w} defer panicToError(&err) defer func() { i.cache, i.w = nil, nil }() a.Params(&i) }() if err != nil { return } var f IREndFlag kcBuf := c.getIdentBuf() sz := wordSize if kc, ok := a.(KnownChecksum); ok { f |= IREndKnownChecksum *(*Checksum)(kcBuf[wordSize:]) = kc.Checksum() sz += len(Checksum{}) } IRKindEnd.encodeHeader(uint32(f)).put(kcBuf[:]) _, err = w.Write(kcBuf[:sz]) c.putIdentBuf(kcBuf) return } // encodeAll implements EncodeAll by recursively encoding dependencies and // performs deduplication by value via the encoded map. func (c *Cache) encodeAll( w io.Writer, a Artifact, encoded map[Artifact]struct{}, ) (err error) { if _, ok := encoded[a]; ok { return } for _, d := range a.Dependencies() { if err = c.encodeAll(w, d, encoded); err != nil { return } } encoded[a] = struct{}{} return c.Encode(w, a) } // EncodeAll writes a self-describing IR stream of a to w and returns the first // non-nil error encountered while writing to w. // // EncodeAll tries to avoid encoding the same [Artifact] more than once, however // it will fail to do so if they do not compare equal by value, as that will // require buffering and greatly reduce performance. It is therefore up to the // caller to avoid causing dependencies to be represented in a way such that // two equivalent artifacts do not compare equal. While an IR stream with // repeated artifacts is valid, it is somewhat inefficient, and the reference // [IRDecoder] implementation produces a warning for it. // // Note that while EncodeAll makes use of the ident free list, it does not use // the ident cache, nor does it contribute identifiers it computes back to the // ident cache. Because of this, multiple invocations of EncodeAll will have // similar cost and does not amortise when combined with a call to Cure. func (c *Cache) EncodeAll(w io.Writer, a Artifact) error { return c.encodeAll(w, a, make(map[Artifact]struct{})) } // ErrRemainingIR is returned for a [IRReadFunc] that failed to call // [IRReader.Finalise] before returning. var ErrRemainingIR = errors.New("implementation did not consume final value") // DanglingIdentError is an identifier in a [IRKindIdent] value that was never // described in the IR stream before it was encountered. type DanglingIdentError unique.Handle[ID] func (e DanglingIdentError) Error() string { return "artifact " + Encode(unique.Handle[ID](e).Value()) + " was never described" } type ( // IRDecoder decodes [Artifact] from an IR stream. The stream is read to // EOF and the final [Artifact] is returned. Previous artifacts may be // looked up by their identifier. // // An [Artifact] may appear more than once in the same IR stream. A // repeating [Artifact] generates a warning via [Cache] and will appear if // verbose logging is enabled. Artifacts may only depend on artifacts // previously described in the IR stream. // // Methods of IRDecoder are not safe for concurrent use. IRDecoder struct { // Address of underlying [Cache], must not be exposed directly. c *Cache // Underlying IR reader. Methods of [IRReader] must not use this as it // bypasses ident measurement. r io.Reader // Artifacts already seen in the IR stream. ident map[unique.Handle[ID]]Artifact // Whether Decode returned, and the entire IR stream was decoded. done, ok bool } // IRReader provides methods to decode the IR wire format and read values // from the reader embedded in the underlying [IRDecoder]. It is // deliberately impossible to obtain the [IRValueKind] of the next value, // and callers must never recover from panics in any read method. // // It is the responsibility of the caller to call Finalise after all IR // values have been read. Failure to call Finalise causes the resulting // [Artifact] to be rejected with [ErrRemainingIR]. // // For an [Artifact] expected to have dependencies, the caller must consume // all dependencies by calling Next until all dependencies are depleted, or // call DiscardAll to explicitly discard them and rely on values encoded as // [IRKindIdent] instead. Failure to consume all unstructured dependencies // causes the resulting [Artifact] to be rejected with [MissedDependencyError]. // // Requesting the value of an unstructured dependency not yet described in // the IR stream via Next, or reading an [IRKindIdent] value not part of // unstructured dependencies via ReadIdent may cause the resulting // [Artifact] to be rejected with [DanglingIdentError], however either // method may return a non-nil [Artifact] implementation of unspecified // value. IRReader struct { // Address of underlying [IRDecoder], should be zeroed or made unusable // after finalisation and must not be exposed directly. d *IRDecoder // Common buffer for word-sized reads. buf [wordSize]byte // Dependencies sent before params, sorted by identifier. Resliced on // each call to Next and checked to be depleted during Finalise. deps []*extIdent // Number of values already read, -1 denotes a finalised IRReader. count int // Header of value currently being read. h irValueHeader // Measured IR reader. All reads for the current [Artifact] must go // through this to produce a correct ident. r io.Reader // Buffers measure writes. Flushed and returned to d during Finalise. ibw *bufio.Writer } // IRReadFunc reads IR values written by [Artifact.Params] to produce an // instance of [Artifact] identical to the one to produce these values. IRReadFunc func(r *IRReader) Artifact ) // kind returns the [IRValueKind] encoded in h. func (h irValueHeader) kind() IRValueKind { return IRValueKind(h & irHeaderMask) } // value returns ancillary data encoded in h. func (h irValueHeader) value() uint32 { return uint32(h >> irHeaderShift) } // irArtifact refers to artifact IR interpretation functions and must not be // written to directly. var irArtifact = make(map[Kind]IRReadFunc) // InvalidKindError is an unregistered [Kind] value. type InvalidKindError Kind func (e InvalidKindError) Error() string { return "invalid artifact kind " + strconv.Itoa(int(e)) } // register records the [IRReadFunc] of an implementation of [Artifact] under // the specified [Kind]. Expecting to be used only during initialization, it // panics if the mapping between [Kind] and [IRReadFunc] is not a bijection. // // register is not safe for concurrent use. register must not be called after // the first instance of [Cache] has been opened. func register(k Kind, f IRReadFunc) { if _, ok := irArtifact[k]; ok { panic("attempting to register " + strconv.Itoa(int(k)) + " twice") } irArtifact[k] = f } // Register records the [IRReadFunc] of a custom implementation of [Artifact] // under the specified [Kind]. Expecting to be used only during initialization, // it panics if the mapping between [Kind] and [IRReadFunc] is not a bijection, // or the specified [Kind] is below [KindCustomOffset]. // // Register is not safe for concurrent use. Register must not be called after // the first instance of [Cache] has been opened. func Register(k Kind, f IRReadFunc) { if k < KindCustomOffset { panic("attempting to register within internal kind range") } register(k, f) } // NewDecoder returns a new [IRDecoder] that reads from the [io.Reader]. func (c *Cache) NewDecoder(r io.Reader) *IRDecoder { return &IRDecoder{c, r, make(map[unique.Handle[ID]]Artifact), false, false} } const ( // irMaxValues is the arbitrary maximum number of values allowed to be // written by [Artifact.Params] and subsequently read via [IRReader]. irMaxValues = 1 << 12 // irMaxDeps is the arbitrary maximum number of direct dependencies allowed // to be returned by [Artifact.Dependencies] and subsequently decoded by // [IRDecoder]. irMaxDeps = 1 << 10 ) var ( // ErrIRValues is returned for an [Artifact] with too many parameter values. ErrIRValues = errors.New("artifact has too many IR parameter values") // ErrIRDepend is returned for an [Artifact] with too many dependencies. ErrIRDepend = errors.New("artifact has too many dependencies") // ErrAlreadyFinalised is returned when attempting to use an [IRReader] that // has already been finalised. ErrAlreadyFinalised = errors.New("reader has already finalised") ) // enterReader panics with an appropriate error for an out-of-bounds count and // must be called at some point in any exported method. func (ir *IRReader) enterReader(read bool) { if ir.count < 0 { panic(ErrAlreadyFinalised) } if ir.count >= irMaxValues { panic(ErrIRValues) } if read { ir.count++ } } // IRKindError describes an attempt to read an IR value of unexpected kind. type IRKindError struct { Got, Want IRValueKind Ancillary uint32 } func (e *IRKindError) Error() string { return fmt.Sprintf( "got %s IR value (%#x) instead of %s", e.Got, e.Ancillary, e.Want, ) } // readFull reads until either p is filled or an error is encountered. func (ir *IRReader) readFull(p []byte) (n int, err error) { for n < len(p) && err == nil { var nn int nn, err = ir.r.Read(p[n:]) n += nn } return } // mustRead reads from the underlying measured reader and panics on error. If // an [io.EOF] is encountered and n != len(p), the error is promoted to a // [io.ErrUnexpectedEOF], if n == 0, [io.EOF] is kept as is, otherwise it is // zeroed. func (ir *IRReader) mustRead(p []byte) { n, err := ir.readFull(p) if err == nil { return } if errors.Is(err, io.EOF) { if n == len(p) { return } err = io.ErrUnexpectedEOF } panic(err) } // mustReadHeader reads the next header via d and checks its kind. func (ir *IRReader) mustReadHeader(k IRValueKind) { ir.mustRead(ir.buf[:]) ir.h = irValueHeader(binary.LittleEndian.Uint64(ir.buf[:])) if wk := ir.h.kind(); wk != k { panic(&IRKindError{wk, k, ir.h.value()}) } } // putAll returns all dependency buffers to the underlying [Cache]. func (ir *IRReader) putAll() { for _, buf := range ir.deps { ir.d.c.putIdentBuf(buf) } ir.deps = nil } // DiscardAll discards all unstructured dependencies. This is useful to // implementations that encode dependencies as [IRKindIdent] which are read back // via ReadIdent. func (ir *IRReader) DiscardAll() { if ir.deps == nil { panic("attempting to discard dependencies twice") } ir.putAll() } // ErrDependencyDepleted is returned when attempting to advance to the next // unstructured dependency when there are none left. var ErrDependencyDepleted = errors.New("reading past end of dependencies") // Next returns the next unstructured dependency. func (ir *IRReader) Next() Artifact { if len(ir.deps) == 0 { panic(ErrDependencyDepleted) } id := unique.Make(ID(ir.deps[0][wordSize:])) ir.d.c.putIdentBuf(ir.deps[0]) ir.deps = ir.deps[1:] if a, ok := ir.d.ident[id]; !ok { ir.putAll() panic(DanglingIdentError(id)) } else { return a } } // MissedDependencyError is the number of unstructured dependencies remaining // in [IRReader] that was never requested or explicitly discarded before // finalisation. type MissedDependencyError int func (e MissedDependencyError) Error() string { return "missed " + strconv.Itoa(int(e)) + " unstructured dependencies" } var ( // ErrUnexpectedChecksum is returned by a [IRReadFunc] that does not expect // a checksum but received one in [IRKindEnd] anyway. ErrUnexpectedChecksum = errors.New("checksum specified on unsupported artifact") // ErrExpectedChecksum is returned by a [IRReadFunc] that expects a checksum // but did not receive one in [IRKindEnd]. ErrExpectedChecksum = errors.New("checksum required but not specified") ) // Finalise reads the final [IRKindEnd] value and marks r as finalised. Methods // of r are invalid upon entry into Finalise. If a [Checksum] is available via // [IREndKnownChecksum], its handle is returned and the caller must store its // value in the resulting [Artifact]. func (ir *IRReader) Finalise() (checksum unique.Handle[Checksum], ok bool) { ir.enterReader(true) ir.count = -1 ir.mustReadHeader(IRKindEnd) f := IREndFlag(ir.h.value()) if f&IREndKnownChecksum != 0 { buf := ir.d.c.getIdentBuf() defer ir.d.c.putIdentBuf(buf) ir.mustRead(buf[wordSize:]) checksum = unique.Make(Checksum(buf[wordSize:])) ok = true } if err := ir.ibw.Flush(); err != nil { panic(err) } ir.r, ir.ibw = nil, nil if len(ir.deps) != 0 { panic(MissedDependencyError(len(ir.deps))) } return } // ReadIdent reads the next value as [IRKindIdent]. func (ir *IRReader) ReadIdent() Artifact { ir.enterReader(true) ir.mustReadHeader(IRKindIdent) buf := ir.d.c.getIdentBuf() defer ir.d.c.putIdentBuf(buf) ir.mustRead(buf[wordSize:]) id := unique.Make(ID(buf[wordSize:])) if a, ok := ir.d.ident[id]; !ok { panic(DanglingIdentError(id)) } else { return a } } // ReadUint32 reads the next value as [IRKindUint32]. func (ir *IRReader) ReadUint32() uint32 { ir.enterReader(true) ir.mustReadHeader(IRKindUint32) return ir.h.value() } // ReadStringBytes reads the next value as [IRKindString] but returns it as a // byte slice instead. func (ir *IRReader) ReadStringBytes() []byte { ir.enterReader(true) ir.mustReadHeader(IRKindString) sz := int(ir.h.value()) szWire := alignSize(sz) if szWire > irMaxStringLength { panic(IRStringError("\x00")) } p := make([]byte, szWire) ir.mustRead(p) return p[:sz] } // ReadString reads the next value as [IRKindString]. func (ir *IRReader) ReadString() string { p := ir.ReadStringBytes() return unsafe.String(unsafe.SliceData(p), len(p)) } // decode decodes the next [Artifact] in the IR stream and returns any buffer // originating from [Cache] before returning. decode returns [io.EOF] if and // only if the underlying [io.Reader] is already read to EOF. func (d *IRDecoder) decode() (a Artifact, err error) { defer panicToError(&err) var ir IRReader defer func() { ir.d = nil }() ir.d = d h := sha512.New384() ir.ibw = d.c.getWriter(h) defer d.c.putWriter(ir.ibw) ir.r = io.TeeReader(d.r, ir.ibw) if n, _err := ir.readFull(ir.buf[:]); _err != nil { if errors.Is(_err, io.EOF) { if n != 0 { _err = io.ErrUnexpectedEOF } } err = _err return } ak := Kind(binary.LittleEndian.Uint64(ir.buf[:])) f, ok := irArtifact[ak] if !ok { err = InvalidKindError(ak) return } defer ir.putAll() ir.mustRead(ir.buf[:]) sz := binary.LittleEndian.Uint64(ir.buf[:]) if sz > irMaxDeps { err = ErrIRDepend return } ir.deps = make([]*extIdent, sz) for i := range ir.deps { ir.deps[i] = d.c.getIdentBuf() } for _, buf := range ir.deps { ir.mustRead(buf[:]) } a = f(&ir) if a == nil { err = syscall.ENOTRECOVERABLE return } if ir.count != -1 { err = ErrRemainingIR return } buf := d.c.getIdentBuf() h.Sum(buf[wordSize:wordSize]) id := unique.Make(ID(buf[wordSize:])) d.c.putIdentBuf(buf) if _, ok = d.ident[id]; !ok { d.ident[id] = a } else { d.c.msg.Verbosef( "artifact %s appeared more than once in IR stream", Encode(id.Value()), ) } return } // Decode consumes the IR stream to EOF and returns the final [Artifact]. After // Decode returns, Lookup is available and Decode must not be called again. func (d *IRDecoder) Decode() (a Artifact, err error) { if d.done { panic("attempting to decode an IR stream twice") } defer func() { d.done = true }() var cur Artifact next: a, err = d.decode() if err == nil { cur = a goto next } if errors.Is(err, io.EOF) { a, err = cur, nil d.ok = true } return } // Lookup looks up an [Artifact] described by the IR stream by its identifier. func (d *IRDecoder) Lookup(id unique.Handle[ID]) (a Artifact, ok bool) { if !d.ok { panic("attempting to look up artifact without full IR stream") } a, ok = d.ident[id] return }