cmd/msrfetch: fetch from enriched composite data

This creates a content-addressed store for all media files. It is not
yet usable, but an API and CDN backend will be built on it in a
later commit, along with tooling for export and tagging.

Signed-off-by: Yonah <contrib@gensokyo.uk>
This commit is contained in:
Yonah 2025-09-18 07:12:10 +09:00
parent 21871a387c
commit f0fb5d9d2e
Signed by: yonah
SSH Key Fingerprint: SHA256:vnQvK8+XXH9Tbni2AV1a/8qdVK/zPcXw52GM0ruQvwA
5 changed files with 1308 additions and 1 deletions

3
.gitignore vendored
View File

@ -7,6 +7,9 @@
*.pkg *.pkg
/msrfetch /msrfetch
# Content-addressed media files
/data
# Test binary, built with `go test -c` # Test binary, built with `go test -c`
*.test *.test

147
cmd/msrfetch/fetch.go Normal file
View File

@ -0,0 +1,147 @@
package main
import (
"bytes"
"context"
"crypto/sha512"
"encoding/hex"
"flag"
"io"
"log"
"os"
"path"
"slices"
"sync"
"git.gensokyo.uk/yonah/monstersirenfetch"
)
// flagFetchDirPath is the directory that content-addressed media files are
// written to. It is populated from the -d flag and defaults to "data".
var flagFetchDirPath string

// flagMaxConnections is the maximum number of simultaneous connections
// allowed. It is populated from the -j flag and defaults to 1.
var flagMaxConnections int

// init registers the flags read by mustFetch on the default command-line
// flag set.
func init() {
	fs := flag.CommandLine
	fs.StringVar(&flagFetchDirPath, "d", "data", "Path to write content-addressed media files to")
	fs.IntVar(&flagMaxConnections, "j", 1, "Maximum number of simultaneous connections allowed")
}
// mustFetch downloads every media file referenced by the composite data read
// from flagOutputPath and stores each one in flagFetchDirPath under the
// hex-encoded SHA-512 digest of its content.
//
// The URLs collected are each album's cover, plus each song's source and, when
// present, its lyric, MV and MV cover. The list is sorted and deduplicated
// before flagMaxConnections workers drain it concurrently. Once every worker
// has finished, a digest-to-URL mapping is written to the file "map" inside
// flagFetchDirPath.
//
// Any failure terminates the program via log.Fatal: malformed or unenriched
// composite data, an out-of-range connection count, a network or filesystem
// error, or two distinct URLs yielding byte-identical content.
func mustFetch(ctx context.Context) {
	var c monstersirenfetch.CompositeAlbumsMap
	mustReadJSON(flagOutputPath, &c)

	const (
		invalidContainsNil = "invalid composite data"
	)

	var urls []string
	for _, ca := range c {
		if ca.Album == nil {
			log.Fatal(invalidContainsNil)
		}
		if ca.CoverURL == "" {
			log.Fatalf("album %s missing coverUrl", ca.CID.String())
		}
		urls = append(urls, ca.CoverURL)

		for _, cs := range ca.Songs {
			if cs == nil {
				log.Fatal(invalidContainsNil)
			}
			if !cs.IsFull() {
				log.Fatal("this composite is not enriched")
			}

			urls = append(urls, cs.SourceURL)
			if cs.LyricURL != "" {
				urls = append(urls, cs.LyricURL)
			}
			if cs.MvURL != "" {
				urls = append(urls, cs.MvURL)
			}
			if cs.MvCoverURL != "" {
				urls = append(urls, cs.MvCoverURL)
			}
		}
	}

	// The same URL may be referenced more than once; fetch each only once.
	slices.Sort(urls)
	urls = slices.Compact(urls)

	if err := os.MkdirAll(flagFetchDirPath, 0755); err != nil {
		log.Fatal(err)
	}

	n := new(netDirect)

	var (
		wg sync.WaitGroup
		// mu guards urlMap and urlMapHs.
		mu sync.Mutex
		uc = make(chan string)

		// urlMap maps a raw digest to the URL that produced it and is used to
		// detect distinct URLs with identical content.
		urlMap = make(map[[sha512.Size]byte]string, len(urls))
		// urlMapHs is the same mapping keyed by the hex digest (the file name);
		// it is what gets written to the "map" file.
		urlMapHs = make(map[string]string, len(urls))
	)

	if flagMaxConnections < 1 {
		log.Fatalf("%d out of range", flagMaxConnections)
	}
	log.Printf("fetching %d files across %d connections", len(urls), flagMaxConnections)

	for i := 0; i < flagMaxConnections; i++ {
		wg.Add(1)
		go func(t int) {
			defer wg.Done()

			for u := range uc {
				buf := new(bytes.Buffer)

				r, l, err := n.Get(ctx, u)
				if err != nil {
					log.Fatal(err)
				}
				// Pre-size the buffer when a positive length was reported.
				if v := int(l); v > 0 {
					buf.Grow(v)
				}
				if _, err = io.Copy(buf, r); err != nil {
					if closeErr := r.Close(); closeErr != nil {
						log.Print(closeErr)
					}
					log.Fatal(err)
				}
				if err = r.Close(); err != nil {
					log.Fatal(err)
				}

				s := sha512.Sum512(buf.Bytes())
				hs := hex.EncodeToString(s[:])

				// The duplicate check and the insertion must share a single
				// critical section: checking under one lock and inserting under
				// another would let two workers holding identical content both
				// pass the check and silently overwrite each other.
				mu.Lock()
				if v, ok := urlMap[s]; ok {
					log.Fatalf("file %s and %s has identical content", u, v)
				}
				urlMap[s] = u
				urlMapHs[hs] = u
				mu.Unlock()

				// The digest is now reserved for this worker, so the file name
				// is unique and the write does not need to hold the lock.
				if err := os.WriteFile(
					path.Join(flagFetchDirPath, hs),
					buf.Bytes(), 0644); err != nil {
					log.Fatal(err)
				}
				log.Printf("%s created from %s (%d)", hs, u, t)
			}
		}(i)
	}

	for _, u := range urls {
		uc <- u
	}
	close(uc)
	wg.Wait()

	{
		pathname := path.Join(flagFetchDirPath, "map")
		mustWriteJSON(pathname, urlMapHs)
		log.Println("map written to", pathname)
	}
}

View File

@ -28,6 +28,9 @@ func main() {
case "enrich": case "enrich":
mustEnrich(ctx) mustEnrich(ctx)
case "fetch":
mustFetch(ctx)
default: default:
log.Fatalf("%q is not a valid command", flag.Args()[0]) log.Fatalf("%q is not a valid command", flag.Args()[0])
} }

1155
testdata/output/fetch.log vendored Normal file

File diff suppressed because it is too large Load Diff

View File

@ -889,7 +889,6 @@ https://web.hycdn.cn/siren/lyric/20250205/9a5eddbaabd7721be7ebbf6a0e272183.lrc
https://web.hycdn.cn/siren/lyric/20250307/44662fd516a0839394abb5616f44b584.lrc https://web.hycdn.cn/siren/lyric/20250307/44662fd516a0839394abb5616f44b584.lrc
https://web.hycdn.cn/siren/lyric/20250407/30c96fe96915b0f15552926166610319.lrc https://web.hycdn.cn/siren/lyric/20250407/30c96fe96915b0f15552926166610319.lrc
https://web.hycdn.cn/siren/lyric/20250415/d38345245de559efccfee217dec2890f.lrc https://web.hycdn.cn/siren/lyric/20250415/d38345245de559efccfee217dec2890f.lrc
https://web.hycdn.cn/siren/lyric/20250415/d38345245de559efccfee217dec2890f.lrc
https://web.hycdn.cn/siren/lyric/20250430/19f12a6e5b193f4010f91e1bd82518ba.lrc https://web.hycdn.cn/siren/lyric/20250430/19f12a6e5b193f4010f91e1bd82518ba.lrc
https://web.hycdn.cn/siren/lyric/20250430/4d584eea0711d292b5d170d47cad7b55.lrc https://web.hycdn.cn/siren/lyric/20250430/4d584eea0711d292b5d170d47cad7b55.lrc
https://web.hycdn.cn/siren/lyric/20250430/cfd84e70b19dc9a47fdf3d4c00341a63.lrc https://web.hycdn.cn/siren/lyric/20250430/cfd84e70b19dc9a47fdf3d4c00341a63.lrc