Add fetch paid articles

This commit is contained in:
dolphinau 2025-07-02 17:50:54 +02:00
parent 4a20c539f6
commit 06dafb24fc
No known key found for this signature in database
2 changed files with 35 additions and 16 deletions

View file

@ -12,3 +12,4 @@ reqwest = "0.12.22"
scraper = "0.23.1" scraper = "0.23.1"
regex = "1.11.1" regex = "1.11.1"
chrono = "0.4.41" chrono = "0.4.41"
rss = "2.0.12"

View file

@ -1,41 +1,59 @@
use std::error::Error;
use chrono::NaiveDate; use chrono::NaiveDate;
use regex::Regex; use regex::Regex;
use reqwest::get; use reqwest::get;
use rss::Channel;
use scraper::{Html, Selector}; use scraper::{Html, Selector};
use tokio::runtime::Runtime; use tokio::{runtime::Runtime, sync::mpsc::unbounded_channel};
/// Entry point: lists the paid article URLs, then looks up the release date of each one.
///
/// For every article that reports a release date, a "Snooze" line is printed to stdout.
/// Failures are written to stderr rather than dropped, so that a broken feed or a changed
/// page layout is visible to whoever runs the tool. A failure on one article does not
/// stop the remaining articles from being processed.
///
/// # Panics
///
/// Panics if the Tokio runtime cannot be created.
fn main() {
    let rt = Runtime::new().expect("failed to create the Tokio runtime");

    rt.block_on(async {
        let articles = match fetch_paid_article_urls().await {
            Ok(articles) => articles,
            Err(err) => {
                eprintln!("Failed to fetch the list of paid articles: {err}");
                return;
            }
        };

        // Articles are fetched one after another; each request completes before the next starts.
        for article in articles {
            match fetch_release_date(&article).await {
                Ok(Some(date)) => {
                    // TODO
                    println!("Snooze {} to {}", article, date);
                }
                // The page carried no release-date notice: nothing to do for this article.
                Ok(None) => {}
                Err(err) => eprintln!("Failed to fetch the release date of {article}: {err}"),
            }
        }
    });
}
async fn fetch_release_date(url: &str) -> Option<NaiveDate> { async fn fetch_release_date(url: &str) -> Result<Option<NaiveDate>, Box<dyn Error>> {
let response = get(url).await.unwrap(); let response = get(url).await?.text().await?;
let response_text = response.text().await.unwrap();
if let Some(article_text) = Html::parse_document(&response_text) if let Some(article_text) = Html::parse_document(&response)
.select(&Selector::parse("div.ArticleText").unwrap()) .select(&Selector::parse("div.ArticleText")?)
.next() .next()
{ {
if let Some(yes) = article_text.select(&Selector::parse("p").unwrap()).last() { if let Some(yes) = article_text.select(&Selector::parse("p")?).last() {
let re = Regex::new( let re = Regex::new(
r#"(?m)\(Alternatively, this item will become freely\n\s* available on ([A-Z][a-z]+ [0-9]{2}, [0-9]{4})\)"#, r#"(?m)\(Alternatively, this item will become freely\n\s* available on ([A-Z][a-z]+ [0-9]{2}, [0-9]{4})\)"#,
) )?;
.unwrap();
if let Some(cap) = re.captures(&yes.inner_html()) { if let Some(cap) = re.captures(&yes.inner_html()) {
if let Some(date) = cap.get(1) { if let Some(date) = cap.get(1) {
return NaiveDate::parse_from_str(date.as_str(), "%B %d, %Y").ok(); let date = NaiveDate::parse_from_str(date.as_str(), "%B %d, %Y")?;
return Ok(Some(date));
} }
} }
} }
} }
None Ok(None)
} }
async fn fetch_paid_articles() -> Option<Vec<String>> { async fn fetch_paid_article_urls() -> Result<Vec<String>, Box<dyn Error>> {
let response = get("https://lwn.net/headlines/rss").await.unwrap(); let response = get("https://lwn.net/headlines/rss").await?.bytes().await?;
let response_text = response.text().await.unwrap(); let channel = Channel::read_from(&response[..])?;
None Ok(channel
.items()
.iter()
.filter(|i| i.title().unwrap_or("").starts_with("[$]"))
.filter_map(|i| i.link())
.map(|s| s.to_string())
.collect::<Vec<String>>())
} }