1
0
go-rss-aggregator/scraper.go

87 lines
1.9 KiB
Go

package main
import (
"context"
"database/sql"
"log"
"strings"
"sync"
"time"
"git.rpuzonas.com/rpuzonas/go-rss-aggregator/internal/database"
)
// startScraping runs an endless scrape loop: each cycle it asks the
// database for up to `concurrency` feeds that are due for fetching,
// scrapes them all in parallel, waits for the batch to finish, and then
// sleeps until the next tick of timeBetweenRequests. It never returns.
func startScraping(
	db *database.Queries,
	concurrency int,
	timeBetweenRequests time.Duration,
) {
	log.Printf("Scraping on %v goroutines every %s duration", concurrency, timeBetweenRequests)
	ticker := time.NewTicker(timeBetweenRequests)
	for {
		nextFeeds, fetchErr := db.GetNextFeedToFetch(context.Background(), int64(concurrency))
		if fetchErr != nil {
			log.Println("Error fetching feeds:", fetchErr)
		} else {
			var wg sync.WaitGroup
			for _, f := range nextFeeds {
				wg.Add(1)
				go scrapeFeed(&wg, db, f)
			}
			// Block until every goroutine in this batch is done before
			// waiting for the next tick.
			wg.Wait()
		}
		<-ticker.C
	}
}
// scrapeFeed marks one feed as fetched, downloads its RSS document, and
// persists every item as a post. Duplicate posts (same URL) are silently
// skipped; items with unparseable dates are logged and dropped. The
// caller must have already incremented wg; this function calls wg.Done.
func scrapeFeed(wg *sync.WaitGroup, db *database.Queries, feed database.Feed) {
	defer wg.Done()

	ctx := context.Background()

	// Record the fetch attempt first so this feed isn't immediately
	// re-selected by GetNextFeedToFetch on the next scrape cycle.
	_, err := db.MarkFeedAsFetched(ctx, feed.ID)
	if err != nil {
		// Fixed typo in log message ("makrking" -> "marking").
		log.Println("Error marking feed as fetched:", err)
		return
	}

	rssFeed, err := urlToFeed(feed.Url)
	if err != nil {
		log.Println("Error fetching feed:", err)
		return
	}

	for _, item := range rssFeed.Channel.Items {
		// Store an empty description as SQL NULL rather than "".
		description := sql.NullString{}
		if item.Description != "" {
			description.String = item.Description
			description.Valid = true
		}

		// Feeds are expected to use RFC 1123 with numeric zone
		// (e.g. "Mon, 02 Jan 2006 15:04:05 -0700"); anything else is skipped.
		publishedAt, err := time.Parse(time.RFC1123Z, item.PubDate)
		if err != nil {
			log.Printf("Couldn't parse date %v with err %v\n", item.PubDate, err)
			continue
		}

		now := time.Now().UTC()
		_, err = db.CreatePost(ctx, database.CreatePostParams{
			UpdatedAt:   now,
			CreatedAt:   now,
			Title:       item.Title,
			Description: description,
			PublishedAt: publishedAt,
			Url:         item.Link,
			FeedID:      feed.ID,
		})
		if err != nil {
			// Posts are unique by URL; re-seeing a known post is
			// expected on every scrape, not an error worth logging.
			if strings.Contains(err.Error(), "UNIQUE constraint failed: posts.url") {
				continue
			}
			log.Println("Failed to add post:", err)
		}
	}
	// NOTE(review): this counts items seen, not posts actually inserted.
	log.Printf("Feed %s collected, %v posts found", feed.Name, len(rssFeed.Channel.Items))
}