Skip to content

Commit

Permalink
Different AI System prompt for Youtube videos
Browse files Browse the repository at this point in the history
  • Loading branch information
mithandir committed Sep 3, 2024
1 parent 1506e47 commit 941b528
Show file tree
Hide file tree
Showing 3 changed files with 50 additions and 33 deletions.
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package ch.climbd.newsfeed.controller.scheduler;
package ch.climbd.newsfeed.controller;

import io.github.thoroldvix.api.TranscriptRetrievalException;
import io.github.thoroldvix.api.YoutubeClient;
Expand All @@ -17,7 +17,7 @@
/**
* Added HTTP proxy configuration for Youtube
*/
final class DefaultYoutubeClientCopy implements YoutubeClient {
public final class DefaultYoutubeClientCopy implements YoutubeClient {
private final HttpClient httpClient;

DefaultYoutubeClientCopy() {
Expand Down
43 changes: 41 additions & 2 deletions src/main/java/ch/climbd/newsfeed/controller/MlController.java
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
package ch.climbd.newsfeed.controller;

import ch.climbd.newsfeed.data.NewsEntry;
import io.github.thoroldvix.api.TranscriptFormatters;
import io.github.thoroldvix.api.TranscriptList;
import io.github.thoroldvix.api.TranscriptRetrievalException;
import io.github.thoroldvix.api.YoutubeTranscriptApi;
import io.github.thoroldvix.internal.TranscriptApiFactory;
import jakarta.annotation.PostConstruct;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
Expand All @@ -15,6 +20,7 @@
public class MlController {

private static final Logger LOG = LoggerFactory.getLogger(MlController.class);
private final YoutubeTranscriptApi youtubeTranscriptApi = TranscriptApiFactory.createWithClient(new DefaultYoutubeClientCopy());

private final ChatClient chatClient;
private final MongoController mongo;
Expand Down Expand Up @@ -49,6 +55,17 @@ public void summarize() {
return;
}
LOG.info("Processing summarization queue of length: {}", queue.size());

processYoutubeTranscription(news);

summarizeNormalText(news);
} catch (Exception e) {
LOG.error("Error summarizing article", e);
}
}

private void summarizeNormalText(NewsEntry news) {
if (!news.getLink().startsWith("https://www.youtube.com/watch?v=")) {
news.setSummary(chatClient.prompt()
.system("You are a news reporter that summarizes news articles")
.user("Write an enganging summary of the following text, for publication in social media, the lenght of the summary should not be more than 3 paragraphs: \n\n" + news.getContent())
Expand All @@ -57,8 +74,30 @@ public void summarize() {
LOG.debug("Summary: {}", news.getSummary());
mongo.update(news);
LOG.info("Summarized the article: {}", news.getTitle());
} catch (Exception e) {
LOG.error("Error summarizing article", e);
}
}

/**
 * Summarizes a YouTube video from its transcript and stores the result on the entry.
 *
 * <p>Only acts on entries whose link starts with {@code https://www.youtube.com/watch?v=};
 * any other entry is left untouched. For a matching entry the transcript requested with
 * language code {@code "en"} is fetched, rendered with the text formatter, sent to the
 * chat client for summarization, written to the entry's summary and persisted through
 * {@code mongo.update}.
 *
 * <p>If the transcript lookup throws {@link TranscriptRetrievalException}, a warning is
 * logged and the entry is neither modified nor persisted.
 *
 * @param item the news entry to process; its link is read and its summary may be set
 */
private void processYoutubeTranscription(NewsEntry item) {
    final String watchPrefix = "https://www.youtube.com/watch?v=";
    final String link = item.getLink();
    if (!link.startsWith(watchPrefix)) {
        return;
    }

    // Everything after "v=" may still carry further query parameters or a fragment
    // (e.g. "&t=30s", "&list=...", "#comments"); only the part before them is the video id.
    final String videoId = link.substring(watchPrefix.length()).split("[&#]", 2)[0];

    try {
        TranscriptList transcriptList = youtubeTranscriptApi.listTranscripts(videoId);
        var fragments = transcriptList.findTranscript("en").fetch();
        var content = TranscriptFormatters.textFormatter().format(fragments);
        LOG.info("Transcript found for video: {}", item.getTitle());

        item.setSummary(chatClient.prompt()
                .system("You are a news reporter that summarizes news articles")
                .user("Create a summary of the following youtube subtitles: \n\n" + content)
                .call()
                .content());
        LOG.debug("Summary: {}", item.getSummary());
        mongo.update(item);
        LOG.info("Summarized the article: {}", item.getTitle());
    } catch (TranscriptRetrievalException e) {
        LOG.warn("No transcript found for video: {}", videoId);
    }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,6 @@
import com.rometools.rome.feed.synd.SyndFeed;
import com.rometools.rome.io.SyndFeedInput;
import com.rometools.rome.io.XmlReader;
import io.github.thoroldvix.api.TranscriptFormatters;
import io.github.thoroldvix.api.TranscriptList;
import io.github.thoroldvix.api.TranscriptRetrievalException;
import io.github.thoroldvix.api.YoutubeTranscriptApi;
import io.github.thoroldvix.internal.TranscriptApiFactory;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.safety.Safelist;
Expand All @@ -33,7 +28,6 @@
public class RssProcessor {
private static final Logger LOG = LoggerFactory.getLogger(RssProcessor.class);
private final ZoneId zoneId = ZoneId.of("Europe/Berlin");
private final YoutubeTranscriptApi youtubeTranscriptApi = TranscriptApiFactory.createWithClient(new DefaultYoutubeClientCopy());

@Autowired
private MongoController mongo;
Expand Down Expand Up @@ -63,15 +57,15 @@ public void processRss(String url, String language) {
pushover.sendNotification(item);
LOG.debug("New entry: {}", item.getTitle());

processYoutubeTranscription(item);
if (item.getContent() != null) {
if (!item.getLink().startsWith("https://www.youtube.com")) { // don't HTML process youtube videos
item.setContent(processHtmlContent(item.getContent()));
}

mongo.update(item);
if (item.getContent().length() > 1000) {
if (item.getLink().startsWith("https://www.youtube.com")) {
mlController.queueSummarize(item);
} else { // PreProcess HTML content
item.setContent(processHtmlContent(item.getContent()));
mongo.update(item);
if (item.getContent().length() > 1000) {
mlController.queueSummarize(item);
}
}
}
}));
Expand All @@ -81,22 +75,6 @@ public void processRss(String url, String language) {
}
}

/**
 * Replaces the content of a YouTube entry with the text of its transcript.
 *
 * <p>Only acts on entries whose link starts with {@code https://www.youtube.com/watch?v=};
 * any other entry is left untouched. For a matching entry the transcript requested with
 * language code {@code "en"} is fetched, rendered with the text formatter and stored as
 * the entry's content.
 *
 * <p>If the transcript lookup throws {@link TranscriptRetrievalException}, a warning is
 * logged and the entry's content is left as it was.
 *
 * @param item the news entry to process; its link is read and its content may be replaced
 */
private void processYoutubeTranscription(NewsEntry item) {
    final String watchPrefix = "https://www.youtube.com/watch?v=";
    final String link = item.getLink();
    if (!link.startsWith(watchPrefix)) {
        return;
    }

    // Everything after "v=" may still carry further query parameters or a fragment
    // (e.g. "&t=30s", "&list=...", "#comments"); only the part before them is the video id.
    final String videoId = link.substring(watchPrefix.length()).split("[&#]", 2)[0];

    try {
        TranscriptList transcriptList = youtubeTranscriptApi.listTranscripts(videoId);
        var fragments = transcriptList.findTranscript("en").fetch();
        var content = TranscriptFormatters.textFormatter().format(fragments);
        LOG.info("Transcript found for video: {}", item.getTitle());
        item.setContent(content);
    } catch (TranscriptRetrievalException e) {
        LOG.warn("No transcript found for video: {}", videoId);
    }
}

private NewsEntry map(SyndEntry item) {
NewsEntry result = new NewsEntry();
String title = item.getTitle().strip();
Expand Down

0 comments on commit 941b528

Please sign in to comment.