Improved feed update, proper handling of duplicate items.
This may cause problems with feeds that do not provide item IDs. If this
becomes an issue (feel free to submit one), I'll add automatic
generation of unique IDs.
SlyMarbo committed Apr 23, 2013
1 parent 7fa8fbc commit 0099aa7
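
Note: the "automatic generation of unique IDs" mentioned above is not part of this commit. Purely as a sketch of one possible approach, a stable fallback ID could be derived by hashing item fields that rarely change between fetches. The helper below is hypothetical: fallbackID does not exist in the library, and a Link field on items is assumed.

package main

import (
	"crypto/sha256"
	"encoding/hex"
	"fmt"
)

// fallbackID is a hypothetical helper, not part of this repository: it
// derives a stable ID for an item that lacks one by hashing fields that
// are unlikely to change between fetches.
func fallbackID(title, link string) string {
	sum := sha256.Sum256([]byte(title + "\x00" + link))
	return hex.EncodeToString(sum[:])
}

func main() {
	fmt.Println(fallbackID("Example item", "http://example.com/item"))
}
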
Showing 4 changed files with 52 additions and 22 deletions.
11 changes: 11 additions & 0 deletions atom.go
@@ -50,7 +50,18 @@ func parseAtom(data []byte, read *db) (*Feed, error) {
		next.ID = item.ID
		next.Read = false

		if next.ID == "" {
			fmt.Printf("Warning: Item %q has no ID and will be ignored.\n", next.Title)
			continue
		}

		if _, ok := out.ItemMap[next.ID]; ok {
			fmt.Printf("Warning: Item %q has duplicate ID.\n", next.Title)
			continue
		}

		out.Items = append(out.Items, next)
		out.ItemMap[next.ID] = struct{}{}
		out.Unread++
	}

11 changes: 11 additions & 0 deletions rss 1.0.go
@@ -82,7 +82,18 @@ func parseRSS1(data []byte, read *db) (*Feed, error) {
		next.ID = item.ID
		next.Read = false

		if next.ID == "" {
			fmt.Printf("Warning: Item %q has no ID and will be ignored.\n", next.Title)
			continue
		}

		if _, ok := out.ItemMap[next.ID]; ok {
			fmt.Printf("Warning: Item %q has duplicate ID.\n", next.Title)
			continue
		}

		out.Items = append(out.Items, next)
		out.ItemMap[next.ID] = struct{}{}
		out.Unread++
	}

12 changes: 12 additions & 0 deletions rss 2.0.go
@@ -60,6 +60,7 @@ func parseRSS2(data []byte, read *db) (*Feed, error) {
	}

	out.Items = make([]*Item, 0, len(channel.Items))
	out.ItemMap = make(map[string]struct{})

	// Process items.
	for _, item := range channel.Items {
@@ -82,7 +83,18 @@ func parseRSS2(data []byte, read *db) (*Feed, error) {
		next.ID = item.ID
		next.Read = false

		if next.ID == "" {
			fmt.Printf("Warning: Item %q has no ID and will be ignored.\n", next.Title)
			continue
		}

		if _, ok := out.ItemMap[next.ID]; ok {
			fmt.Printf("Warning: Item %q has duplicate ID.\n", next.Title)
			continue
		}

		out.Items = append(out.Items, next)
		out.ItemMap[next.ID] = struct{}{}
		out.Unread++
	}

40 changes: 18 additions & 22 deletions rss.go
@@ -12,6 +12,7 @@ import (

// Parse RSS or Atom data.
func Parse(data []byte) (*Feed, error) {

	if strings.Contains(string(data), "<rss") {
		return parseRSS2(data, database)
	} else if strings.Contains(string(data), "xmlns=\"http://purl.org/rss/1.0/\"") {
@@ -37,15 +38,15 @@ func Fetch(url string) (*Feed, error) {
	}

	out, err := Parse(body)
	if err != nil {
		return nil, err
	}
	if out.Link == "" {
		out.Link = url
	}
	return out, nil
	if err != nil {
		return nil, err
	}

	if out.Link == "" {
		out.Link = url
	}

	return out, nil
}

// Feed is the top-level structure.
@@ -55,6 +56,7 @@ type Feed struct {
	Link        string
	Image       *Image
	Items       []*Item
	ItemMap     map[string]struct{}
	Refresh     time.Time
	Unread      uint32
}
@@ -71,6 +73,10 @@ func (f *Feed) Update() error {
		return errors.New("Error: feed has no URL.")
	}

	if f.ItemMap == nil {
		return errors.New("Error: Feed has no ItemMap.")
	}

	update, err := Fetch(f.Link)
	if err != nil {
		return err
@@ -80,21 +86,11 @@ func (f *Feed) Update() error {
	f.Title = update.Title
	f.Description = update.Description

	// Find the offset between items.
	offset := 0
	for _, item := range f.Items {
		if item.ID == update.Items[0].ID {
			break
		}
		offset++
	}

	for i, item := range update.Items {
		if i+offset >= len(f.Items) {
	for _, item := range update.Items {
		if _, ok := f.ItemMap[item.ID]; !ok {
			f.Items = append(f.Items, item)
			f.ItemMap[item.ID] = struct{}{}
			f.Unread++
		} else if f.Items[i+offset].ID != item.ID {
			return errors.New("Error: offsets don't match.")
		}
	}

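For context, here is a minimal usage sketch of the new map-based update path. It assumes the import path github.com/SlyMarbo/rss and relies only on exported names that appear in this diff (Fetch, Update, Items, Unread, Title, Read); error handling is kept short.

package main

import (
	"fmt"
	"log"

	"github.com/SlyMarbo/rss" // assumed import path for this repository
)

func main() {
	// Fetch downloads and parses the feed, populating Items and ItemMap.
	feed, err := rss.Fetch("http://example.com/feed.xml")
	if err != nil {
		log.Fatal(err)
	}

	// Update re-fetches the feed and, with this commit, appends only
	// items whose IDs are not already present in feed.ItemMap.
	if err := feed.Update(); err != nil {
		log.Fatal(err)
	}

	fmt.Printf("%d unread items\n", feed.Unread)
	for _, item := range feed.Items {
		if !item.Read {
			fmt.Println(item.Title)
		}
	}
}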
