commit 39cea7b77c3d8a5f2596441c306a7270f4894ebc
Author: Carl Pearson
Date:   Mon Dec 30 05:36:42 2024 -0700

    Initial commit

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..fef2a97
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+subreddits
+go.sum
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..13c02a6
--- /dev/null
+++ b/README.md
@@ -0,0 +1,9 @@
+# reimager for Reddit
+
+Download images from reddit.
+
+```bash
+go mod tidy
+
+go run main.go
+```
\ No newline at end of file
diff --git a/go.mod b/go.mod
new file mode 100644
index 0000000..93e33db
--- /dev/null
+++ b/go.mod
@@ -0,0 +1,8 @@
+module github.com/cwpearson/reddit-images
+
+go 1.23.0
+
+require (
+	github.com/gabriel-vasile/mimetype v1.4.7 // indirect
+	golang.org/x/net v0.31.0 // indirect
+)
diff --git a/main.go b/main.go
new file mode 100644
index 0000000..e130060
--- /dev/null
+++ b/main.go
@@ -0,0 +1,34 @@
+package main
+
+import (
+	"log"
+	"time"
+
+	"github.com/cwpearson/reddit-images/rate_limit"
+	"github.com/cwpearson/reddit-images/reddit"
+)
+
+// main downloads images from a fixed list of subreddits, then sleeps
+// 30 minutes and repeats forever. A single shared RateLimit is used so
+// all subreddits draw from the same Reddit API quota.
+func main() {
+
+	subreddits := []string{
+		"pics",
+		"oldschoolcool",
+		"thewaywewere",
+		"MilitaryPorn",
+		"EarthPorn",
+	}
+
+	rl := rate_limit.NewRateLimit()
+
+	for {
+		for _, subreddit := range subreddits {
+			r := reddit.NewReddit(rl, subreddit)
+			r.Get()
+		}
+
+		when := time.Now().Add(30 * time.Minute)
+		log.Println("sleep until", when)
+		time.Sleep(time.Until(when))
+	}
+
+}
diff --git a/rate_limit/rate_limit.go b/rate_limit/rate_limit.go
new file mode 100644
index 0000000..f1cd410
--- /dev/null
+++ b/rate_limit/rate_limit.go
@@ -0,0 +1,127 @@
+// Package rate_limit provides an HTTP GET client that honors Reddit's
+// X-Ratelimit-* response headers, sleeping when the quota is exhausted.
+package rate_limit
+
+import (
+	"fmt"
+	"io"
+	"log"
+	"net/http"
+	"strconv"
+	"time"
+)
+
+// RateLimit tracks Reddit API quota state parsed from response headers.
+type RateLimit struct {
+	client    *http.Client
+	used      int       // requests used this window (informational)
+	remaining int       // requests left before we must wait
+	reset     time.Time // when the quota window resets
+}
+
+// getFutureTime returns a time `seconds` seconds from now.
+func getFutureTime(seconds int) time.Time {
+	return time.Now().Add(time.Duration(seconds) * time.Second)
+}
+
+// sleepUntil blocks until `when`; returns immediately if `when` is in the past.
+func sleepUntil(when time.Time) {
+	now := time.Now()
+	if now.After(when) {
+		return
+	}
+	time.Sleep(when.Sub(now))
+}
+
+// NewRateLimit returns a RateLimit with an optimistic initial quota of
+// 100 requests; real values are learned from the first response headers.
+func NewRateLimit() *RateLimit {
+	return &RateLimit{
+		// Timeout guards against a hung connection stalling the whole
+		// download loop indefinitely.
+		client:    &http.Client{Timeout: 30 * time.Second},
+		remaining: 100,
+		reset:     time.Now(),
+	}
+}
+
+// UpdateUsed parses the X-Ratelimit-Used header value; ignored if unparseable.
+func (rl *RateLimit) UpdateUsed(used string) {
+	val, err := strconv.ParseFloat(used, 64)
+	if err == nil {
+		rl.used = int(val)
+		log.Println("used ->", rl.used)
+	}
+}
+
+// UpdateRemaining parses the X-Ratelimit-Remaining header value.
+func (rl *RateLimit) UpdateRemaining(remaining string) {
+	val, err := strconv.ParseFloat(remaining, 64)
+	if err == nil {
+		rl.remaining = int(val)
+		log.Println("remaining ->", rl.remaining)
+	}
+}
+
+// UpdateReset parses the X-Ratelimit-Reset header (seconds until reset).
+// Only moves the reset time forward, never backward.
+func (rl *RateLimit) UpdateReset(reset string) {
+	val, err := strconv.ParseFloat(reset, 64)
+	if err == nil {
+		maybe := getFutureTime(int(val))
+		if rl.reset.Before(maybe) {
+			rl.reset = maybe
+			log.Println("reset ->", rl.reset)
+		}
+	}
+}
+
+// Get makes an HTTP GET request to the specified URL while respecting rate limits.
+// It sleeps first if the quota is exhausted, then updates quota state from
+// the response headers before checking the status code.
+func (rl *RateLimit) Get(url string, accept string) ([]byte, error) {
+	if rl.remaining <= 0 {
+		log.Println("no requests remaining, sleep until", rl.reset)
+		sleepUntil(rl.reset)
+	}
+
+	// Create new request
+	log.Println("GET", url)
+	req, err := http.NewRequest("GET", url, nil)
+	if err != nil {
+		return nil, fmt.Errorf("failed to create request: %w", err)
+	}
+
+	// Add any required headers
+	req.Header.Add("User-Agent", "linux:reddit-images:0.1")
+	if accept != "" {
+		req.Header.Add("Accept", accept)
+	}
+
+	// Make the request
+	resp, err := rl.client.Do(req)
+	if err != nil {
+		return nil, fmt.Errorf("failed to make request: %w", err)
+	}
+	defer resp.Body.Close()
+
+	// parse rate limit headers even on error responses, so quota state
+	// stays current
+	used := resp.Header.Get("X-Ratelimit-Used")
+	remaining := resp.Header.Get("X-Ratelimit-Remaining")
+	reset := resp.Header.Get("X-Ratelimit-Reset")
+	rl.UpdateUsed(used)
+	rl.UpdateRemaining(remaining)
+	rl.UpdateReset(reset)
+
+	if resp.StatusCode == http.StatusTooManyRequests {
+		rl.remaining = 0
+
+		// if the reset time is before now, just pick a while in the future for the next retry
+		if rl.reset.Before(time.Now()) {
+			rl.UpdateReset("450")
+		}
+		return nil, fmt.Errorf("request failed with 429")
+	}
+
+	if resp.StatusCode != http.StatusOK {
+		return nil, fmt.Errorf("request failed with status: %d", resp.StatusCode)
+	}
+
+	body, err := io.ReadAll(resp.Body)
+	if err != nil {
+		return nil, fmt.Errorf("failed to read response body: %w", err)
+	}
+
+	return body, nil
+}
diff --git a/reddit/gallery.go b/reddit/gallery.go
new file mode 100644
index 0000000..75be61d
--- /dev/null
+++ b/reddit/gallery.go
@@ -0,0 +1,63 @@
+package reddit
+
+import (
+	"encoding/json"
+	"fmt"
+	"log"
+
+	"github.com/cwpearson/reddit-images/rate_limit"
+)
+
+// GListing mirrors the top-level object of a gallery post's .json payload.
+type GListing struct {
+	Data GListingData `json:"data"`
+}
+
+type GListingData struct {
+	Children []GListingDataChild `json:"children"`
+}
+
+type GListingDataChild struct {
+	Data GListingDataChildData `json:"data"`
+}
+
+type GListingDataChildData struct {
+	MediaMetadata map[string]Metadata `json:"media_metadata"`
+}
+
+// Metadata identifies one gallery image: its id and mime type (e.g. "image/jpg").
+type Metadata struct {
+	Id       string `json:"id"`
+	Mimetype string `json:"m"`
+}
+
+// GalleryImageMetadata fetches `url`.json and returns the media metadata for
+// every image in the gallery post. Map iteration means result order is not
+// stable across calls.
+func GalleryImageMetadata(rl *rate_limit.RateLimit, url string) ([]Metadata, error) {
+	jsonUrl := fmt.Sprintf("%s.json?raw_json=1", url)
+	log.Printf("gallery url: %s -> %s", url, jsonUrl)
+
+	content, err := rl.Get(jsonUrl, "")
+	if err != nil {
+		return nil, err
+	}
+
+	var data []GListing
+	err = json.Unmarshal(content, &data)
+	if err != nil {
+		return nil, err
+	}
+
+	// Guard before indexing: an empty or unexpected response must not
+	// panic the whole process.
+	if len(data) == 0 || len(data[0].Data.Children) == 0 {
+		return nil, fmt.Errorf("unexpected gallery response shape for %s", jsonUrl)
+	}
+
+	res := []Metadata{}
+
+	for _, val := range data[0].Data.Children[0].Data.MediaMetadata {
+		res = append(res, val)
+	}
+
+	return res, nil
+}
diff --git a/reddit/reddit.go b/reddit/reddit.go
new file mode 100644
index 0000000..02f6f05
--- /dev/null
+++ b/reddit/reddit.go
@@ -0,0 +1,251 @@
+package reddit
+
+import (
+	"encoding/json"
+	"fmt"
+	"log"
+	"net/url"
+	"os"
+	"path/filepath"
+	"strings"
+	"time"
+
+	"github.com/cwpearson/reddit-images/rate_limit"
+	"github.com/gabriel-vasile/mimetype"
+)
+
+// Response represents the outer JSON structure
+type Response struct {
+	Data ListingData `json:"data"`
+}
+
+// ListingData represents the data field containing children
+type ListingData struct {
+	After    string  `json:"after"`
+	Children []Child `json:"children"`
+}
+
+// Child represents each item in the children array
+type Child struct {
+	Kind string    `json:"kind"`
+	Data ChildData `json:"data"`
+}
+
+// ChildData represents the nested data in each child
+type ChildData struct {
+	Title               string  `json:"title"`
+	Author              string  `json:"author"`
+	URLOverriddenByDest string  `json:"url_overridden_by_dest"`
+	URL                 string  `json:"url"`
+	Created             float64 `json:"created"`
+	Id                  string  `json:"id"`
+}
+
+// Reddit downloads images from one subreddit via a shared rate limiter.
+type Reddit struct {
+	subreddit string
+	retries   int
+	rl        *rate_limit.RateLimit
+}
+
+// NewReddit returns a Reddit for `subreddit` using the shared rate limiter.
+func NewReddit(rl *rate_limit.RateLimit, subreddit string) *Reddit {
+	return &Reddit{
+		subreddit: subreddit,
+		retries:   3,
+		rl:        rl,
+	}
+}
+
+// Next fetches one page of /hot for the subreddit.
+// It returns children, the pagination token to pass back as `after`, and error.
+func (r *Reddit) Next(after string) ([]ChildData, string, error) {
+
+	baseURL := fmt.Sprintf("https://reddit.com/r/%s/hot.json", r.subreddit)
+
+	u, err := url.Parse(baseURL)
+	if err != nil {
+		// Return rather than panic: callers already handle Next errors.
+		return nil, "", fmt.Errorf("parsing url %q: %w", baseURL, err)
+	}
+
+	// Create query parameters
+	params := url.Values{}
+	params.Add("raw_json", "1")
+	params.Add("limit", "100")
+	if after != "" {
+		params.Add("after", after)
+	}
+
+	// Add the query parameters to the URL
+	u.RawQuery = params.Encode()
+
+	var body []byte
+	for try := 0; try < r.retries; try++ {
+		body, err = r.rl.Get(u.String(), "")
+		if err != nil {
+			fmt.Printf("Error getting subreddit: %v\n", err)
+			body = nil
+			time.Sleep(time.Second * time.Duration(5))
+			continue
+		}
+		break
+	}
+	if body == nil {
+		return nil, "", fmt.Errorf("retries exceeded")
+	}
+	response := Response{}
+	err = json.Unmarshal(body, &response)
+	if err != nil {
+		return nil, "", err
+	}
+
+	res := []ChildData{}
+	for _, c := range response.Data.Children {
+		if c.Kind == "t3" { // link
+			res = append(res, c.Data)
+		}
+	}
+	return res, response.Data.After, nil
+}
+
+// getImage downloads url and writes it to outDir/<stem><ext>, where the
+// extension is sniffed from the downloaded bytes.
+func getImage(rl *rate_limit.RateLimit, url, outDir, stem string) error {
+	contents, err := rl.Get(url, "image/*")
+	if err != nil {
+		return err
+	}
+	mtype := mimetype.Detect(contents)
+	name := stem + mtype.Extension()
+	outPath := filepath.Join(outDir, name)
+	log.Println("write", outPath)
+	return os.WriteFile(outPath, contents, 0644)
+}
+
+// Get downloads the first /hot page of the subreddit into
+// subreddits/<name>/, skipping files already present on disk (matched by
+// filename stem). Gallery posts are expanded to their individual images.
+func (r *Reddit) Get() {
+	var children []ChildData
+	var err error
+
+	outDir := filepath.Join("subreddits", r.subreddit)
+	err = os.MkdirAll(outDir, 0755)
+	if err != nil && !os.IsExist(err) {
+		log.Println("ERROR: couldn't create out directory", outDir)
+		return
+	}
+
+	// load existing names
+	existing := map[string]struct{}{}
+	entries, err := os.ReadDir(outDir)
+	if err != nil {
+		log.Println("ERROR: couldn't read directory", outDir)
+		return
+	}
+	for _, entry := range entries {
+		if entry.IsDir() {
+			continue
+		}
+		filename := entry.Name()
+		nameWithoutSuffix := strings.TrimSuffix(filename, filepath.Ext(filename))
+		existing[nameWithoutSuffix] = struct{}{}
+	}
+
+	children, _, err = r.Next("")
+	if err != nil {
+		log.Println("ERROR: Next error:", err)
+		return
+	}
+
+	for _, child := range children {
+		log.Println("Title:", child.Title)
+		shortTitle := child.Title
+		// truncate by runes, not bytes, so a multibyte character is
+		// never cut in half
+		if runes := []rune(shortTitle); len(runes) > 32 {
+			shortTitle = string(runes[:32])
+		}
+		shortTitle = sanitizeFilename(shortTitle)
+
+		if strings.Contains(child.URLOverriddenByDest, "www.reddit.com/gallery") {
+
+			metas, err := GalleryImageMetadata(r.rl, child.URLOverriddenByDest)
+			if err != nil {
+				log.Println("ERROR: Gallery handling error:", err)
+				continue
+			}
+
+			log.Println("Gallery metas:", metas)
+
+			for mi, meta := range metas {
+				parts := strings.Split(meta.Mimetype, "/")
+				if len(parts) == 2 {
+					imgUrl := fmt.Sprintf("https://i.redd.it/%s.%s", meta.Id, parts[1])
+
+					stem := fmt.Sprintf("%d_%s_%d_%s", int64(child.Created), shortTitle, mi, meta.Id)
+
+					if _, ok := existing[stem]; ok {
+						log.Println(stem, "already downloaded")
+						continue
+					}
+
+					err := getImage(r.rl, imgUrl, outDir, stem)
+					if err != nil {
+						log.Println("ERROR: getImage:", err)
+						continue
+					}
+					existing[stem] = struct{}{}
+				}
+			}
+
+			continue
+		} else {
+
+			stem := fmt.Sprintf("%d_%s_%s", int64(child.Created), shortTitle, child.Id)
+
+			if _, ok := existing[stem]; ok {
+				log.Println(stem, "already downloaded")
+				continue
+			}
+
+			err := getImage(r.rl, child.URLOverriddenByDest, outDir, stem)
+			if err != nil {
+				log.Println("ERROR: getImage:", err)
+				continue
+			}
+			existing[stem] = struct{}{}
+		}
+	}
+}
+
+// sanitizeFilename strips characters unsafe for filenames, replaces spaces
+// with hyphens, and drops non-ASCII runes; returns "unnamed_file" if the
+// result is empty.
+func sanitizeFilename(input string) string {
+	// Replace path separators and problematic characters
+	replacer := strings.NewReplacer(
+		"/", "",
+		"\\", "",
+		":", "",
+		"*", "",
+		"?", "",
+		"\"", "",
+		"<", "",
+		">", "",
+		"|", "",
+		",", "",
+		";", "",
+		"\x00", "", // null byte
+		" ", "-", // replace spaces with hyphens
+	)
+	cleaned := replacer.Replace(input)
+
+	// Remove non-ASCII characters
+	var result strings.Builder
+	for _, r := range cleaned {
+		if r < 128 { // Keep only ASCII characters
+			result.WriteRune(r)
+		}
+	}
+	cleaned = result.String()
+
+	// Trim spaces (though they should already be replaced with hyphens)
+	cleaned = strings.TrimSpace(cleaned)
+
+	// If the filename becomes empty after cleaning, provide a default
+	if cleaned == "" {
+		return "unnamed_file"
+	}
+
+	return cleaned
+}