cleantitle.py: option to forcibly remove anything that looks like HTML
author Jack Miller <jack@codezen.org>
Wed, 8 Jul 2015 17:05:09 +0000 (12:05 -0500)
committer Jack Miller <jack@codezen.org>
Wed, 8 Jul 2015 17:05:09 +0000 (12:05 -0500)
plugins/cleantitle.py

index 2314a19..7c3f994 100644 (file)
@@ -19,12 +19,35 @@ replacements = [
     ("<nobr />", ""),
 ]
 
+# Also included is an option to forcibly remove anything that looks like HTML.
+# It is useful for cleaning horribly formatted feeds, but it can destroy
+# legitimate content wrapped in <>. Set NO_HTML_EVER to True to enable it.
+
+NO_HTML_EVER = False
+
 from canto_next.plugins import check_program
 
 check_program("canto-curses")
 
 from canto_curses.story import StoryPlugin
 
+# From kjellgren (canto-curses issue #18)
+
+def remove_html_markup(s):
+    """Return s with everything that looks like an HTML tag removed.
+
+    A tag starts at '<' and ends at the next '>' that is not inside a
+    quoted attribute value. Text outside tags is copied through unchanged,
+    including quote characters.
+
+    This is a deliberately blunt character scanner, not an HTML parser: a
+    bare '<' or '>' in ordinary text is treated as markup too, so
+    "1 < 2" loses everything from the '<' onwards.
+    """
+    in_tag = False
+    # The quote character (' or ") that opened the attribute value currently
+    # being skipped, or None. Remembering which one opened it means that an
+    # apostrophe inside a double-quoted value (title="it's") does not end the
+    # value early and swallow the rest of the string.
+    open_quote = None
+    kept = []
+    for c in s:
+        if open_quote is not None:
+            # Inside a quoted attribute value: ignore everything, including
+            # '<' and '>', until the matching quote closes it.
+            if c == open_quote:
+                open_quote = None
+        elif c == '<':
+            in_tag = True
+        elif c == '>':
+            # Dropped even when no tag is open, like any other markup char.
+            in_tag = False
+        elif in_tag:
+            if c == '"' or c == "'":
+                open_quote = c
+        else:
+            kept.append(c)
+    # Join once at the end instead of growing a string per character.
+    return "".join(kept)
+
 class CleanTitle(StoryPlugin):
     def __init__(self, story):
         self.story = story
@@ -36,4 +59,7 @@ class CleanTitle(StoryPlugin):
         for o,n in replacements:
             t = t.replace(o, n)
 
+        if NO_HTML_EVER:
+            t = remove_html_markup(t)
+
         self.story.content["title"] = t