README | 17 +++++++++++++++++ cmd/dl/main.go | 197 +++++++++++++++++++++++++++++++++++++++++++++++++++++ cmd/get-ids | 7 +++++++ cmd/ls/main.go | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++++ cmd/mk-ids/main.go | 27 +++++++++++++++++++++++++++ const.go | 7 +++++++ go.mod | 5 +++++ go.sum | 2 ++ diff --git a/README b/README new file mode 100644 index 0000000000000000000000000000000000000000..53edaa2e0ca0aa0de45327befdb4b2f3f697cf62b94c89cab014e98a18aa4c1a --- /dev/null +++ b/README @@ -0,0 +1,17 @@ +arXiv downloader. +Get list of XMLs using https://github.com/miku/metha: + + $ metha-sync -no-compression -base-dir ... http://export.arxiv.org/oai2 + +Then get list of ids: + + $ parallel "cmd/get-ids {} >{}.ids" ::: *.xml + +Then generate state databases: + + $ cat *.ids | cmd/mk-ids >ids + $ num=$(wc -l arxiv.MaxFnLen { + log.Fatalln("too long filename:", dst) + } + { + var fn [arxiv.MaxFnLen]byte + copy(fn[:], []byte(dst)) + FnsM.Lock() + if _, err = Fns.Seek(arxiv.MaxFnLen*int64(num), io.SeekStart); err != nil { + log.Fatal(err) + } + if _, err = Fns.Write(fn[:]); err != nil { + log.Fatal(err) + } + FnsM.Unlock() + } + s = resp.Header.Get("Content-Length") + if s == "" { + log.Println(num, "no Content-Length:", u, resp.Header) + return + } + size, err := strconv.ParseUint(s, 10, 64) + if err != nil { + log.Println(num, "parse Content-Length:", u, s, err) + return + } + dst = path.Join(Out, dst) + if s, err := os.Stat(dst); err == nil { + if uint64(s.Size()) == size { + log.Println(num, "exists:", dst) + codeWrite(num, resp.StatusCode) + return + } + log.Println(num, "size mismatch:", dst) + } + fh, err := os.OpenFile(dst+".tmp", os.O_WRONLY|os.O_CREATE|os.O_EXCL, 0o666) + if err != nil { + log.Fatal(err) + } + _, err = io.CopyN(fh, resp.Body, int64(size)) + resp.Body.Close() + if err != nil { + log.Print(num, "copy:", dst, err) + fh.Close() + return + } + if err = fh.Close(); err != nil { + log.Fatal(err) + } + if err = os.Rename(dst+".tmp", dst); err != 
nil { + log.Fatal(err) + } + codeWrite(num, resp.StatusCode) + log.Println(num, "ok:", dst, humanize.IBytes(size)) +} + +func main() { + idsPth := flag.String("ids", "ids", "Path to ids database") + codesPth := flag.String("codes", "codes", "Path to codes database") + fnsPth := flag.String("fns", "fns", "Path to fns database") + outPth := flag.String("out", "out", "Path to out/ directory") + skip := flag.Uint("skip", 0, "Skip those records") + jobsNum := flag.Uint("jobs", 1, "Number of jobs") + flag.Parse() + var err error + Ids, err = os.Open(*idsPth) + if err != nil { + log.Fatal(err) + } + Codes, err = os.OpenFile(*codesPth, os.O_RDWR, 0o666) + if err != nil { + log.Fatal(err) + } + Fns, err = os.OpenFile(*fnsPth, os.O_RDWR, 0o666) + if err != nil { + log.Fatal(err) + } + Out = *outPth + Jobs = make(chan struct{}, *jobsNum) + for range *jobsNum { + Jobs <- struct{}{} + } + id := make([]byte, arxiv.MaxIdLen) + codeRaw := make([]byte, arxiv.CodeLen) + num := int(*skip) + var code uint16 + var idIdx int + var s string + br := bufio.NewReader(Ids) + if _, err = br.Discard(num * arxiv.MaxIdLen); err != nil { + log.Fatal(err) + } + for { + if _, err = io.ReadFull(br, id); err != nil { + log.Fatal(err) + } + idIdx = bytes.Index(id, []byte{0}) + if idIdx == -1 { + idIdx = arxiv.MaxIdLen + } + s = string(id[:idIdx]) + CodesM.Lock() + if _, err = Codes.Seek(arxiv.CodeLen*int64(num), io.SeekStart); err != nil { + log.Fatal(err) + } + if _, err = io.ReadFull(Codes, codeRaw); err != nil { + log.Fatal(err) + } + CodesM.Unlock() + code = binary.BigEndian.Uint16(codeRaw) + switch code { + case 0, http.StatusNotAcceptable: + <-Jobs + go func(num int, u string) { + fetch(num, u) + Jobs <- struct{}{} + }(num, s) + case http.StatusOK, http.StatusNotFound: + default: + log.Fatalln("unsupported code:", code) + } + num++ + } +} diff --git a/cmd/get-ids b/cmd/get-ids new file mode 100755 index 
#!/bin/sh
# Get list of ids from XML.
#
# Usage: get-ids FILE.xml
#
# Reformats FILE.xml with xmllint, extracts the https: URL that follows a
# dc:identifier tag on each line, keeps only the lines mentioning arxiv.org
# and strips a leading https://arxiv.org/abs/ prefix. Prints one id per line.

# "$1" is quoted so that a path containing whitespace or glob characters
# reaches xmllint as a single, unmodified argument.
xmllint --format "$1" |
sed -n "s/^.*dc:identifier.\(https:.*\)<.*$/\1/p" |
grep https:..arxiv.org |
sed 's#^https://arxiv.org/abs/##'
[]byte{0}) + if idIdx == -1 { + idIdx = arxiv.MaxIdLen + } + fnIdx = bytes.Index(fn, []byte{0}) + if fnIdx == -1 { + fnIdx = arxiv.MaxFnLen + } + fmt.Println(num, string(id[:idIdx]), string(fn[:fnIdx]), binary.BigEndian.Uint16(code)) + num++ + } +} diff --git a/cmd/mk-ids/main.go b/cmd/mk-ids/main.go new file mode 100644 index 0000000000000000000000000000000000000000..13b2b3e98744b7ea51f1da95298ce9fe37d5a249c8ecbf28213f5c7d7ba4a472 --- /dev/null +++ b/cmd/mk-ids/main.go @@ -0,0 +1,27 @@ +package main + +import ( + "bufio" + "log" + "os" + + "go.stargrave.org/arxiv" +) + +func main() { + s := bufio.NewScanner(os.Stdin) + bw := bufio.NewWriter(os.Stdout) + fn := make([]byte, arxiv.MaxIdLen) + for s.Scan() { + if len(s.Bytes()) > arxiv.MaxIdLen { + log.Fatal("too long") + } + copy(fn, s.Bytes()) + clear(fn[len(s.Bytes()):]) + bw.Write(fn) + } + if err := s.Err(); err != nil { + log.Fatal(err) + } + bw.Flush() +} diff --git a/const.go b/const.go new file mode 100644 index 0000000000000000000000000000000000000000..2058c38edf365936122ccf5aa89db9d5a9d4636ab4b589eb4055005c06574f2b --- /dev/null +++ b/const.go @@ -0,0 +1,7 @@ +package arxiv + +const ( + MaxFnLen = 64 + MaxIdLen = 16 + CodeLen = 2 +) diff --git a/go.mod b/go.mod new file mode 100644 index 0000000000000000000000000000000000000000..9d7ab69124012acdeb93a994e51d8b40b84875c1b69988480a0a17e744fa6b94 --- /dev/null +++ b/go.mod @@ -0,0 +1,5 @@ +module go.stargrave.org/arxiv + +go 1.26.3 + +require github.com/dustin/go-humanize v1.0.1 diff --git a/go.sum b/go.sum new file mode 100644 index 0000000000000000000000000000000000000000..96334064e1f54c43b73a6cbd80267698aac7056087d4b3ad4f9156f293de19a1 --- /dev/null +++ b/go.sum @@ -0,0 +1,2 @@ +github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY= +github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto=