Files
hakurei/internal/pkg/ir.go
Ophestra ffd2f979fb
All checks were successful
Test / Create distribution (push) Successful in 1m0s
Test / Sandbox (push) Successful in 2m43s
Test / Hakurei (push) Successful in 3m50s
Test / ShareFS (push) Successful in 4m1s
Test / Hpkg (push) Successful in 4m44s
Test / Sandbox (race detector) (push) Successful in 5m9s
Test / Hakurei (race detector) (push) Successful in 5m55s
Test / Flake checks (push) Successful in 1m44s
internal/pkg: skip duplicate early
This significantly increases IR generation performance.

Signed-off-by: Ophestra <cat@gensokyo.uk>
2026-02-07 17:11:41 +09:00

763 lines
21 KiB
Go

package pkg
import (
"bufio"
"bytes"
"context"
"crypto/sha512"
"encoding/binary"
"errors"
"fmt"
"io"
"slices"
"strconv"
"syscall"
"unique"
"unsafe"
)
// wordSize is the boundary, in bytes, that every binary segment is aligned to.
const wordSize = 8

// alignSize rounds sz up to the next multiple of wordSize. A size that is
// already aligned is returned unchanged.
func alignSize(sz int) int {
	// The outer modulo collapses a full word of padding down to zero.
	padding := (wordSize - sz%wordSize) % wordSize
	return sz + padding
}
// panicToError must be deferred directly by the function it protects. It
// recovers from a panic and, when the panic value implements error, stores it
// in *errP unless *errP already holds a non-nil error. A panic value that does
// not implement error is re-panicked unchanged.
func panicToError(errP *error) {
	// recover only works when called directly by the deferred function, so it
	// must stay here rather than move into a helper.
	v := recover()
	if v == nil {
		return
	}

	err, isErr := v.(error)
	if !isErr {
		panic(v)
	}
	if *errP == nil {
		*errP = err
	}
}
// IContext is passed to [Artifact.Params] and provides methods for writing
// values to the IR writer. It does not expose the underlying [io.Writer].
//
// IContext is valid until [Artifact.Params] returns. Encode clears both fields
// once Params returns, leaving the value unusable afterwards.
//
// Encode constructs IContext with an unkeyed composite literal, so the order
// of the fields below must not change.
type IContext struct {
// Address of underlying [Cache], should be zeroed or made unusable after
// [Artifact.Params] returns and must not be exposed directly.
cache *Cache
// Destination of every value written through the methods of IContext.
// Should be zeroed after [Artifact.Params] returns and must not be exposed
// directly.
w io.Writer
}
// Unwrap exposes the [context.Context] held by the underlying [Cache].
func (i *IContext) Unwrap() context.Context {
	return i.cache.ctx
}
// irZero is a zero IR word. Write slices it to produce the zero padding that
// aligns a string value to wordSize.
var irZero [wordSize]byte
// IRValueKind identifies what kind of value an IR header introduces.
type IRValueKind uint32

const (
	// IRKindEnd denotes the end of the current parameters stream. The ancillary
	// value is interpreted as [IREndFlag].
	IRKindEnd IRValueKind = iota
	// IRKindIdent denotes the identifier of a dependency [Artifact]. The
	// ancillary value is reserved for future use.
	IRKindIdent
	// IRKindUint32 denotes a uint32 value inlined in the ancillary half of the
	// header.
	IRKindUint32
	// IRKindString denotes a string with its true length encoded in header
	// ancillary data. Its wire length is always aligned to 8 byte boundary.
	IRKindString
)

const (
	// irHeaderShift is the bit offset of the ancillary value in a header.
	irHeaderShift = 32
	// irHeaderMask selects the kind stored in the low half of a header.
	irHeaderMask = 0xffffffff
)

// String returns a user-facing name of k.
func (k IRValueKind) String() string {
	names := [...]string{
		IRKindEnd:    "terminator",
		IRKindIdent:  "ident",
		IRKindUint32: "uint32",
		IRKindString: "string",
	}
	if uint64(k) < uint64(len(names)) {
		return names[k]
	}
	return "invalid kind " + strconv.Itoa(int(k))
}

// irValueHeader packs an [IRValueKind] into its low 32 bits and a 32-bit
// ancillary value into its high 32 bits.
type irValueHeader uint64

// encodeHeader returns the irValueHeader carrying k together with the
// ancillary value v.
func (k IRValueKind) encodeHeader(v uint32) irValueHeader {
	ancillary := irValueHeader(v) << irHeaderShift
	return ancillary | irValueHeader(k)
}

// put stores h in b[0:8] in little-endian byte order. b must hold at least
// eight bytes.
func (h irValueHeader) put(b []byte) {
	binary.LittleEndian.PutUint64(b, uint64(h))
}

// append appends the little-endian bytes of h to b and returns the extended
// slice.
func (h irValueHeader) append(b []byte) []byte {
	word := uint64(h)
	return binary.LittleEndian.AppendUint64(b, word)
}
// IREndFlag is ancillary data encoded in the header of an [IRKindEnd] value and
// specifies the presence of optional fields in the remaining [IRKindEnd] data.
// Order of present fields is the order of their corresponding constants defined
// below. Each constant occupies one bit, so flags are combined with bitwise OR.
type IREndFlag uint32
const (
// IREndKnownChecksum denotes a [KnownChecksum] artifact. For an [IRKindEnd]
// value with this flag set, the remaining data contains the [Checksum].
// It is the lowest bit of the flag word.
IREndKnownChecksum IREndFlag = 1 << iota
)
// mustWrite writes p to IContext.w and panics with the write error if there is
// one. Encode recovers that panic and returns it as its own error.
func (i *IContext) mustWrite(p []byte) {
	_, err := i.w.Write(p)
	if err == nil {
		return
	}
	panic(err)
}
// WriteIdent writes the identifier of [Artifact] to the IR as an
// [IRKindIdent] value: a header word with zero ancillary data followed by the
// identifier. The behaviour of WriteIdent is not defined for an [Artifact] not
// part of the slice returned by [Artifact.Dependencies].
func (i *IContext) WriteIdent(a Artifact) {
	c := i.cache
	buf := c.getIdentBuf()
	// Return the scratch buffer even when the write below panics.
	defer c.putIdentBuf(buf)

	header := IRKindIdent.encodeHeader(0)
	header.put(buf[:])
	*(*ID)(buf[wordSize:]) = c.Ident(a).Value()
	i.mustWrite(buf[:])
}
// WriteUint32 writes v to the IR as an [IRKindUint32] value. The value travels
// in the ancillary half of the header, so exactly one word is written.
func (i *IContext) WriteUint32(v uint32) {
	header := IRKindUint32.encodeHeader(v)
	i.mustWrite(header.append(nil))
}
// irMaxStringLength is the maximum acceptable wire size, in bytes, of
// [IRKindString] (1 MiB). Write enforces it when encoding and ReadStringBytes
// enforces it when decoding.
const irMaxStringLength = 1 << 20
// IRStringError is a string value too big to encode in IR.
type IRStringError string

// Error returns a fixed message; the oversized value itself is not included.
func (IRStringError) Error() string {
	const msg = "params value too big to encode in IR"
	return msg
}
// Write writes p to the IR as an [IRKindString] value: a header carrying the
// true length of p, the bytes of p, then zero padding up to the next word
// boundary. It panics with [IRStringError] when p or its padded size exceeds
// irMaxStringLength.
func (i *IContext) Write(p []byte) {
	wire := alignSize(len(p))
	if len(p) > irMaxStringLength || wire > irMaxStringLength {
		panic(IRStringError(p))
	}

	i.mustWrite(IRKindString.encodeHeader(uint32(len(p))).append(nil))
	i.mustWrite(p)
	if padding := wire - len(p); padding > 0 {
		i.mustWrite(irZero[:padding])
	}
}
// WriteString writes s as a string value to the IR. The bytes of s are passed
// to Write without copying.
func (i *IContext) WriteString(s string) {
	i.Write(unsafe.Slice(unsafe.StringData(s), len(s)))
}
// Encode writes a deterministic, efficient representation of a to w and returns
// the first non-nil error encountered while writing to w.
//
// The representation consists of, in order: the kind of a and the number of
// distinct dependencies as two little-endian words, the sorted and
// deduplicated dependency identifiers each prefixed by its kind, the values
// written by [Artifact.Params], and a terminating [IRKindEnd] value that
// carries the [Checksum] when a implements [KnownChecksum].
//
// A panic carrying an error value raised while Params runs is recovered and
// returned as err; any other panic value propagates to the caller.
func (c *Cache) Encode(w io.Writer, a Artifact) (err error) {
deps := a.Dependencies()
idents := make([]*extIdent, len(deps))
for i, d := range deps {
// NOTE(review): unsafeIdent is defined elsewhere; a nil buffer presumably
// means only the identifier handle was available — confirm.
dbuf, did := c.unsafeIdent(d, true)
if dbuf == nil {
// Build the wire form by hand: kind word followed by the identifier.
dbuf = c.getIdentBuf()
binary.LittleEndian.PutUint64(dbuf[:], uint64(d.Kind()))
*(*ID)(dbuf[wordSize:]) = did.Value()
} else {
c.storeIdent(d, dbuf)
}
// Deliberately deferred inside the loop: every buffer stays referenced by
// idents until Encode returns.
defer c.putIdentBuf(dbuf)
idents[i] = dbuf
}
// Sort by raw bytes and drop duplicates so the output does not depend on the
// order or multiplicity of the entries returned by Dependencies.
slices.SortFunc(idents, func(a, b *extIdent) int {
return bytes.Compare(a[:], b[:])
})
idents = slices.CompactFunc(idents, func(a, b *extIdent) bool {
return *a == *b
})
// kind uint64 | deps_sz uint64
var buf [wordSize * 2]byte
binary.LittleEndian.PutUint64(buf[:], uint64(a.Kind()))
binary.LittleEndian.PutUint64(buf[wordSize:], uint64(len(idents)))
if _, err = w.Write(buf[:]); err != nil {
return
}
for _, dn := range idents {
// kind uint64 | ident ID
if _, err = w.Write(dn[:]); err != nil {
return
}
}
// Run Params in its own function so that panicToError only covers that call
// and the IContext is cleared before anything else happens.
func() {
i := IContext{c, w}
defer panicToError(&err)
defer func() { i.cache, i.w = nil, nil }()
a.Params(&i)
}()
if err != nil {
return
}
// Terminator: one header word, followed by the checksum when it is known.
var f IREndFlag
kcBuf := c.getIdentBuf()
sz := wordSize
if kc, ok := a.(KnownChecksum); ok {
f |= IREndKnownChecksum
*(*Checksum)(kcBuf[wordSize:]) = kc.Checksum()
sz += len(Checksum{})
}
IRKindEnd.encodeHeader(uint32(f)).put(kcBuf[:])
_, err = w.Write(kcBuf[:sz])
c.putIdentBuf(kcBuf)
return
}
// encodeAll implements EncodeAll. It encodes the dependencies of a depth-first
// before a itself, so every artifact is described before anything that refers
// to it. Artifacts already present in encoded, compared by value, are skipped
// before their dependencies are visited.
func (c *Cache) encodeAll(
	w io.Writer,
	a Artifact,
	encoded map[Artifact]struct{},
) (err error) {
	if _, seen := encoded[a]; seen {
		return nil
	}

	for _, dep := range a.Dependencies() {
		if err = c.encodeAll(w, dep, encoded); err != nil {
			return err
		}
	}

	encoded[a] = struct{}{}
	return c.Encode(w, a)
}
// EncodeAll writes a self-describing IR stream of a to w and returns the first
// non-nil error encountered while writing to w. Dependencies are written
// before the artifacts that depend on them.
//
// EncodeAll tries to avoid encoding the same [Artifact] more than once, however
// it will fail to do so if they do not compare equal by value, as that will
// require buffering and greatly reduce performance. It is therefore up to the
// caller to avoid causing dependencies to be represented in a way such that
// two equivalent artifacts do not compare equal. While an IR stream with
// repeated artifacts is valid, it is somewhat inefficient, and the reference
// [IRDecoder] implementation produces a warning for it.
//
// Note that while EncodeAll makes use of the ident free list, it does not use
// the ident cache, nor does it contribute identifiers it computes back to the
// ident cache. Because of this, multiple invocations of EncodeAll will have
// similar cost and does not amortise when combined with a call to Cure.
//
// NOTE(review): Encode calls c.unsafeIdent and c.storeIdent, which look like
// ident cache accesses — confirm the paragraph above is still accurate.
func (c *Cache) EncodeAll(w io.Writer, a Artifact) error {
	seen := make(map[Artifact]struct{})
	return c.encodeAll(w, a, seen)
}
// ErrRemainingIR is returned for an [IRReadFunc] that failed to call
// [IRReader.Finalise] before returning, leaving the terminating [IRKindEnd]
// value unread.
var ErrRemainingIR = errors.New("implementation did not consume final value")
// DanglingIdentError is the identifier of an artifact that was referenced,
// either as an unstructured dependency or through an [IRKindIdent] value,
// before being described in the IR stream.
type DanglingIdentError unique.Handle[ID]

// Error names the missing artifact by its encoded identifier.
func (e DanglingIdentError) Error() string {
	id := unique.Handle[ID](e).Value()
	return "artifact " + Encode(id) + " was never described"
}
type (
// IRDecoder decodes [Artifact] from an IR stream. The stream is read to
// EOF and the final [Artifact] is returned. Previous artifacts may be
// looked up by their identifier.
//
// An [Artifact] may appear more than once in the same IR stream. A
// repeating [Artifact] generates a warning via [Cache] and will appear if
// verbose logging is enabled. Artifacts may only depend on artifacts
// previously described in the IR stream.
//
// Methods of IRDecoder are not safe for concurrent use.
//
// NewDecoder builds IRDecoder with an unkeyed composite literal, so the
// order of the fields below must not change.
IRDecoder struct {
// Address of underlying [Cache], must not be exposed directly.
c *Cache
// Underlying IR reader. Methods of [IRReader] must not use this as it
// bypasses ident measurement.
r io.Reader
// Artifacts already seen in the IR stream, keyed by the identifier
// measured while decoding them.
ident map[unique.Handle[ID]]Artifact
// done is set once Decode has returned, successfully or not. ok is set
// only when the entire IR stream was decoded up to EOF, and gates Lookup.
done, ok bool
}
// IRReader provides methods to decode the IR wire format and read values
// from the reader embedded in the underlying [IRDecoder]. It is
// deliberately impossible to obtain the [IRValueKind] of the next value,
// and callers must never recover from panics in any read method.
//
// It is the responsibility of the caller to call Finalise after all IR
// values have been read. Failure to call Finalise causes the resulting
// [Artifact] to be rejected with [ErrRemainingIR].
//
// For an [Artifact] expected to have dependencies, the caller must consume
// all dependencies by calling Next until all dependencies are depleted, or
// call DiscardAll to explicitly discard them and rely on values encoded as
// [IRKindIdent] instead. Failure to consume all unstructured dependencies
// causes the resulting [Artifact] to be rejected with [MissedDependencyError].
//
// Requesting the value of an unstructured dependency not yet described in
// the IR stream via Next, or reading an [IRKindIdent] value not part of
// unstructured dependencies via ReadIdent may cause the resulting
// [Artifact] to be rejected with [DanglingIdentError], however either
// method may return a non-nil [Artifact] implementation of unspecified
// value.
IRReader struct {
// Address of underlying [IRDecoder], should be zeroed or made unusable
// after finalisation and must not be exposed directly.
d *IRDecoder
// Common buffer for word-sized reads.
buf [wordSize]byte
// Dependencies sent before params, sorted by identifier. Resliced on
// each call to Next and checked to be depleted during Finalise. A nil
// value means the dependencies were discarded.
deps []*extIdent
// Number of values already read, -1 denotes a finalised IRReader.
// Incremented by enterReader and bounded by irMaxValues.
count int
// Header of value currently being read.
h irValueHeader
// Measured IR reader. All reads for the current [Artifact] must go
// through this to produce a correct ident.
r io.Reader
// Buffers measure writes. Flushed during Finalise; decode returns it to
// the underlying [Cache].
ibw *bufio.Writer
}
// IRReadFunc reads IR values written by [Artifact.Params] to produce an
// instance of [Artifact] identical to the one that produced these values.
// Returning nil causes the artifact to be rejected.
IRReadFunc func(r *IRReader) Artifact
)
// kind extracts the [IRValueKind] stored in the low 32 bits of h.
func (h irValueHeader) kind() IRValueKind {
	low := h & irHeaderMask
	return IRValueKind(low)
}
// value extracts the ancillary data stored in the high 32 bits of h.
func (h irValueHeader) value() uint32 {
	high := h >> irHeaderShift
	return uint32(high)
}
// irArtifact maps each registered [Kind] to the [IRReadFunc] that interprets
// its IR. It is populated through register, consulted by the decoder, and must
// not be written to directly.
var irArtifact = make(map[Kind]IRReadFunc)
// InvalidKindError is a [Kind] value with no registered [IRReadFunc].
type InvalidKindError Kind

// Error reports the numeric value of the unregistered kind.
func (e InvalidKindError) Error() string {
	num := strconv.Itoa(int(e))
	return "invalid artifact kind " + num
}
// register records the [IRReadFunc] of an implementation of [Artifact] under
// the specified [Kind]. Expecting to be used only during initialization, it
// panics if k already has an [IRReadFunc] registered. It does not check
// whether f is already registered under another [Kind].
//
// register is not safe for concurrent use. register must not be called after
// the first instance of [Cache] has been opened.
func register(k Kind, f IRReadFunc) {
	if _, dup := irArtifact[k]; dup {
		panic("attempting to register " + strconv.Itoa(int(k)) + " twice")
	}
	irArtifact[k] = f
}
// Register records the [IRReadFunc] of a custom implementation of [Artifact]
// under the specified [Kind]. Expecting to be used only during initialization,
// it panics if the specified [Kind] is below [KindCustomOffset] or already has
// an [IRReadFunc] registered.
//
// Register is not safe for concurrent use. Register must not be called after
// the first instance of [Cache] has been opened.
func Register(k Kind, f IRReadFunc) {
	if k >= KindCustomOffset {
		register(k, f)
		return
	}
	panic("attempting to register within internal kind range")
}
// NewDecoder returns a new [IRDecoder] that reads an IR stream from r. The
// decoder starts with no known artifacts and has not yet decoded anything.
func (c *Cache) NewDecoder(r io.Reader) *IRDecoder {
	return &IRDecoder{
		c:     c,
		r:     r,
		ident: make(map[unique.Handle[ID]]Artifact),
		done:  false,
		ok:    false,
	}
}
const (
// irMaxValues is the arbitrary maximum number of values allowed to be
// written by [Artifact.Params] and subsequently read via [IRReader]
// (4096). The reader counts the terminating value read by Finalise as well.
irMaxValues = 1 << 12
// irMaxDeps is the arbitrary maximum number of direct dependencies allowed
// to be returned by [Artifact.Dependencies] and subsequently decoded by
// [IRDecoder] (1024). In the code visible here only the decoder checks it.
irMaxDeps = 1 << 10
)
var (
// ErrIRValues is returned for an [Artifact] with too many parameter values,
// see irMaxValues.
ErrIRValues = errors.New("artifact has too many IR parameter values")
// ErrIRDepend is returned for an [Artifact] with too many dependencies,
// see irMaxDeps.
ErrIRDepend = errors.New("artifact has too many dependencies")
// ErrAlreadyFinalised is returned when attempting to use an [IRReader] that
// has already been finalised.
ErrAlreadyFinalised = errors.New("reader has already finalised")
)
// enterReader guards entry into a method of IRReader. It panics with
// [ErrAlreadyFinalised] when the reader is finalised and with [ErrIRValues]
// when the value limit has been reached. Otherwise, when read is true, it
// counts one more value as read.
func (ir *IRReader) enterReader(read bool) {
	switch {
	case ir.count < 0:
		panic(ErrAlreadyFinalised)
	case ir.count >= irMaxValues:
		panic(ErrIRValues)
	case read:
		ir.count++
	}
}
// IRKindError describes an attempt to read an IR value of unexpected kind.
type IRKindError struct {
	// Got is the kind found in the stream.
	Got IRValueKind
	// Want is the kind the caller asked for.
	Want IRValueKind
	// Ancillary is the ancillary data of the offending header.
	Ancillary uint32
}

// Error reports the kind found, its ancillary data, and the kind expected.
func (e *IRKindError) Error() string {
	const format = "got %s IR value (%#x) instead of %s"
	return fmt.Sprintf(format, e.Got, e.Ancillary, e.Want)
}
// readFull reads from the measured reader until p is filled or a read fails.
// Unlike [io.ReadFull], the error of the last read is returned as is, even
// when p was filled completely.
func (ir *IRReader) readFull(p []byte) (n int, err error) {
	for err == nil && n < len(p) {
		var nn int
		nn, err = ir.r.Read(p[n:])
		n += nn
	}
	return n, err
}
// mustRead fills p from the underlying measured reader and panics on error.
// An [io.EOF] reported together with a completely filled p is ignored. An
// [io.EOF] reported with fewer bytes than len(p), including none at all, is
// promoted to [io.ErrUnexpectedEOF], so mustRead never panics with a bare
// [io.EOF].
func (ir *IRReader) mustRead(p []byte) {
	n, err := ir.readFull(p)
	switch {
	case err == nil:
		return
	case !errors.Is(err, io.EOF):
		panic(err)
	case n == len(p):
		// The reader signalled EOF alongside the final bytes.
		return
	default:
		panic(io.ErrUnexpectedEOF)
	}
}
// mustReadHeader reads the next header word through the measured reader,
// stores it in ir.h, and panics with [IRKindError] unless its kind is k.
func (ir *IRReader) mustReadHeader(k IRValueKind) {
	ir.mustRead(ir.buf[:])
	ir.h = irValueHeader(binary.LittleEndian.Uint64(ir.buf[:]))

	got := ir.h.kind()
	if got == k {
		return
	}
	panic(&IRKindError{Got: got, Want: k, Ancillary: ir.h.value()})
}
// putAll hands every remaining dependency buffer back to the underlying
// [Cache] and leaves ir.deps nil.
func (ir *IRReader) putAll() {
	// ir.d is only dereferenced when there is a buffer to return.
	for i := range ir.deps {
		ir.d.c.putIdentBuf(ir.deps[i])
	}
	ir.deps = nil
}
// DiscardAll discards all unstructured dependencies. This is useful to
// implementations that encode dependencies as [IRKindIdent] which are read back
// via ReadIdent. It panics when the dependencies were already discarded.
func (ir *IRReader) DiscardAll() {
	if ir.deps != nil {
		ir.putAll()
		return
	}
	panic("attempting to discard dependencies twice")
}
// ErrDependencyDepleted is the error raised by Next when attempting to advance
// to the next unstructured dependency when there are none left.
var ErrDependencyDepleted = errors.New("reading past end of dependencies")
// Next returns the next unstructured dependency and advances past it. It
// panics with [ErrDependencyDepleted] when none are left, and with
// [DanglingIdentError] when the dependency was not described earlier in the IR
// stream.
func (ir *IRReader) Next() Artifact {
	if len(ir.deps) == 0 {
		panic(ErrDependencyDepleted)
	}

	head := ir.deps[0]
	id := unique.Make(ID(head[wordSize:]))
	ir.d.c.putIdentBuf(head)
	ir.deps = ir.deps[1:]

	a, ok := ir.d.ident[id]
	if !ok {
		// Release the remaining buffers before giving up on this artifact.
		ir.putAll()
		panic(DanglingIdentError(id))
	}
	return a
}
// MissedDependencyError is the number of unstructured dependencies still held
// by [IRReader] at finalisation, neither requested nor explicitly discarded.
type MissedDependencyError int

// Error reports how many dependencies were missed.
func (e MissedDependencyError) Error() string {
	count := strconv.Itoa(int(e))
	return "missed " + count + " unstructured dependencies"
}
var (
// ErrUnexpectedChecksum is returned by an [IRReadFunc] that does not expect
// a checksum but received one in [IRKindEnd] anyway.
// NOTE(review): not referenced in the code visible here; presumably used by
// [IRReadFunc] implementations elsewhere — confirm.
ErrUnexpectedChecksum = errors.New("checksum specified on unsupported artifact")
// ErrExpectedChecksum is returned by an [IRReadFunc] that expects a checksum
// but did not receive one in [IRKindEnd].
ErrExpectedChecksum = errors.New("checksum required but not specified")
)
// Finalise reads the final [IRKindEnd] value and marks r as finalised. Methods
// of r are invalid upon entry into Finalise. If a [Checksum] is available via
// [IREndKnownChecksum], its handle is returned with ok set and the caller must
// store its value in the resulting [Artifact].
//
// Finalise panics with [MissedDependencyError] if unstructured dependencies
// remain that were neither requested nor discarded.
func (ir *IRReader) Finalise() (checksum unique.Handle[Checksum], ok bool) {
	ir.enterReader(true)
	// Mark the reader finalised before reading so later calls are rejected.
	ir.count = -1
	ir.mustReadHeader(IRKindEnd)

	if flags := IREndFlag(ir.h.value()); flags&IREndKnownChecksum != 0 {
		buf := ir.d.c.getIdentBuf()
		defer ir.d.c.putIdentBuf(buf)
		ir.mustRead(buf[wordSize:])
		checksum, ok = unique.Make(Checksum(buf[wordSize:])), true
	}

	// Push everything read so far into the measurement before letting go of
	// the reader.
	if err := ir.ibw.Flush(); err != nil {
		panic(err)
	}
	ir.r, ir.ibw = nil, nil

	if missed := len(ir.deps); missed != 0 {
		panic(MissedDependencyError(missed))
	}
	return checksum, ok
}
// ReadIdent reads the next value as [IRKindIdent] and returns the artifact
// previously described under that identifier. It panics with
// [DanglingIdentError] when no such artifact has been described yet.
func (ir *IRReader) ReadIdent() Artifact {
	ir.enterReader(true)
	ir.mustReadHeader(IRKindIdent)

	scratch := ir.d.c.getIdentBuf()
	defer ir.d.c.putIdentBuf(scratch)
	ir.mustRead(scratch[wordSize:])

	id := unique.Make(ID(scratch[wordSize:]))
	a, ok := ir.d.ident[id]
	if !ok {
		panic(DanglingIdentError(id))
	}
	return a
}
// ReadUint32 reads the next value as [IRKindUint32]. The value is carried in
// the ancillary data of the header, so nothing beyond the header is read.
func (ir *IRReader) ReadUint32() uint32 {
	ir.enterReader(true)
	ir.mustReadHeader(IRKindUint32)
	v := ir.h.value()
	return v
}
// ReadStringBytes reads the next value as [IRKindString] but returns it as a
// byte slice instead. The returned slice has the true length encoded in the
// header; the alignment padding is read but not returned.
//
// ReadStringBytes panics with [IRStringError] when the encoded length or its
// padded wire size exceeds irMaxStringLength.
func (ir *IRReader) ReadStringBytes() []byte {
	ir.enterReader(true)
	ir.mustReadHeader(IRKindString)

	// Check the untrusted length while it is still unsigned. Where int is 32
	// bits wide a large value would convert to a negative int, slip past the
	// wire size check below, and fail in make or the slice expression with a
	// runtime error instead of IRStringError.
	n := ir.h.value()
	if n > irMaxStringLength {
		panic(IRStringError("\x00"))
	}

	sz := int(n)
	szWire := alignSize(sz)
	if szWire > irMaxStringLength {
		panic(IRStringError("\x00"))
	}

	p := make([]byte, szWire)
	ir.mustRead(p)
	return p[:sz]
}
// ReadString reads the next value as [IRKindString]. The string shares memory
// with the bytes returned by ReadStringBytes rather than copying them.
func (ir *IRReader) ReadString() string {
	raw := ir.ReadStringBytes()
	return unsafe.String(unsafe.SliceData(raw), len(raw))
}
// decode decodes the next [Artifact] in the IR stream and returns any buffer
// originating from [Cache] before returning. decode returns [io.EOF] if and
// only if the underlying [io.Reader] is already read to EOF.
//
// Every byte read for the artifact is also fed to a SHA-384 hash; the digest
// becomes the identifier under which the artifact is recorded in d.ident.
// Panics carrying an error value, raised by [IRReader] methods or by the
// [IRReadFunc], are recovered and returned as err.
func (d *IRDecoder) decode() (a Artifact, err error) {
defer panicToError(&err)
var ir IRReader
// Registered first so it runs last: the reader loses access to d only after
// the deferred cleanup below has used it.
defer func() { ir.d = nil }()
ir.d = d
h := sha512.New384()
// NOTE(review): getWriter presumably returns a buffered writer over h taken
// from a free list — confirm.
ir.ibw = d.c.getWriter(h)
defer d.c.putWriter(ir.ibw)
// Everything read through ir.r is mirrored into the hash via ir.ibw.
ir.r = io.TeeReader(d.r, ir.ibw)
// First word: artifact kind. EOF before any byte is the only clean end of
// stream; EOF after a partial word is a truncated stream.
if n, _err := ir.readFull(ir.buf[:]); _err != nil {
if errors.Is(_err, io.EOF) {
if n != 0 {
_err = io.ErrUnexpectedEOF
}
}
err = _err
return
}
ak := Kind(binary.LittleEndian.Uint64(ir.buf[:]))
f, ok := irArtifact[ak]
if !ok {
err = InvalidKindError(ak)
return
}
// Return whatever dependency buffers are still held, on every path.
defer ir.putAll()
// Second word: number of dependency identifiers that follow.
ir.mustRead(ir.buf[:])
sz := binary.LittleEndian.Uint64(ir.buf[:])
// Bound the count before allocating anything based on it.
if sz > irMaxDeps {
err = ErrIRDepend
return
}
ir.deps = make([]*extIdent, sz)
for i := range ir.deps {
ir.deps[i] = d.c.getIdentBuf()
}
for _, buf := range ir.deps {
ir.mustRead(buf[:])
}
// Hand over to the implementation to read its params and finalise.
a = f(&ir)
if a == nil {
err = syscall.ENOTRECOVERABLE
return
}
// Finalise sets count to -1; anything else means it was never called.
if ir.count != -1 {
err = ErrRemainingIR
return
}
// Append the digest into the identifier portion of a scratch buffer.
buf := d.c.getIdentBuf()
h.Sum(buf[wordSize:wordSize])
id := unique.Make(ID(buf[wordSize:]))
d.c.putIdentBuf(buf)
// The first occurrence of an identifier wins; repeats are only reported.
if _, ok = d.ident[id]; !ok {
d.ident[id] = a
} else {
d.c.msg.Verbosef(
"artifact %s appeared more than once in IR stream",
Encode(id.Value()),
)
}
return
}
// Decode consumes the IR stream to EOF and returns the final [Artifact]. After
// Decode returns successfully, Lookup is available. Decode must not be called
// again and panics if it is.
//
// Any error other than a clean end of stream is returned as is.
func (d *IRDecoder) Decode() (a Artifact, err error) {
	if d.done {
		panic("attempting to decode an IR stream twice")
	}
	defer func() { d.done = true }()

	// Keep decoding until the first error, remembering the last artifact
	// decoded successfully.
	var last Artifact
	for {
		if a, err = d.decode(); err != nil {
			break
		}
		last = a
	}

	if errors.Is(err, io.EOF) {
		a, err = last, nil
		d.ok = true
	}
	return
}
// Lookup looks up an [Artifact] described by the IR stream by its identifier.
// It panics unless Decode has consumed the entire IR stream successfully.
func (d *IRDecoder) Lookup(id unique.Handle[ID]) (a Artifact, ok bool) {
	if d.ok {
		a, ok = d.ident[id]
		return a, ok
	}
	panic("attempting to look up artifact without full IR stream")
}