87 lines
1.9 KiB
Go
87 lines
1.9 KiB
Go
package main
|
|
|
|
import (
|
|
"context"
|
|
"database/sql"
|
|
"log"
|
|
"strings"
|
|
"sync"
|
|
"time"
|
|
|
|
"git.rpuzonas.com/rpuzonas/go-rss-aggregator/internal/database"
|
|
)
|
|
|
|
func startScraping(
|
|
db *database.Queries,
|
|
concurrency int,
|
|
timeBetweenRequests time.Duration,
|
|
) {
|
|
log.Printf("Scraping on %v goroutines every %s duration", concurrency, timeBetweenRequests)
|
|
|
|
ticker := time.NewTicker(timeBetweenRequests)
|
|
for ; ; <-ticker.C {
|
|
feeds, err := db.GetNextFeedToFetch(context.Background(), int64(concurrency))
|
|
if err != nil {
|
|
log.Println("Error fetching feeds:", err)
|
|
continue
|
|
}
|
|
|
|
wg := sync.WaitGroup{}
|
|
for _, feed := range feeds {
|
|
wg.Add(1)
|
|
go scrapeFeed(&wg, db, feed)
|
|
}
|
|
wg.Wait()
|
|
}
|
|
}
|
|
|
|
func scrapeFeed(wg *sync.WaitGroup, db *database.Queries, feed database.Feed) {
|
|
defer wg.Done()
|
|
|
|
ctx := context.Background()
|
|
|
|
_, err := db.MarkFeedAsFetched(ctx, feed.ID)
|
|
if err != nil {
|
|
log.Println("Error makrking feed as fetched:", err)
|
|
return
|
|
}
|
|
|
|
rssFeed, err := urlToFeed(feed.Url)
|
|
if err != nil {
|
|
log.Println("Error fetching feed:", err)
|
|
return
|
|
}
|
|
|
|
for _, item := range rssFeed.Channel.Items {
|
|
description := sql.NullString{}
|
|
if item.Description != "" {
|
|
description.String = item.Description
|
|
description.Valid = true
|
|
}
|
|
|
|
publishedAt, err := time.Parse(time.RFC1123Z, item.PubDate)
|
|
if err != nil {
|
|
log.Printf("Couldn't parse date %v with err %v\n", item.PubDate, err)
|
|
continue
|
|
}
|
|
|
|
now := time.Now().UTC()
|
|
_, err = db.CreatePost(ctx, database.CreatePostParams{
|
|
UpdatedAt: now,
|
|
CreatedAt: now,
|
|
Title: item.Title,
|
|
Description: description,
|
|
PublishedAt: publishedAt,
|
|
Url: item.Link,
|
|
FeedID: feed.ID,
|
|
})
|
|
if err != nil {
|
|
if strings.Contains(err.Error(), "UNIQUE constraint failed: posts.url") {
|
|
continue
|
|
}
|
|
log.Println("Failed to add post:", err)
|
|
}
|
|
}
|
|
log.Printf("Feed %s collected, %v posts found", feed.Name, len(rssFeed.Channel.Items))
|
|
}
|