catch more presences JO (wip #108)

regardscitoyens · Apr 21, 2018 · 4981179 · 4981179
1 parent c2ba746
commit 4981179
Showing 1 changed file with 109 additions and 99 deletions.
diff --git a/batch/jo/parse_jo.py b/batch/jo/parse_jo.py
@@ -35,15 +35,17 @@ def date_iso(datestr):
 
 reg = {}
 reg['date'] = '^([0-9]{4})-([0-9]{2})-([0-9]{2})$'
-reg['com'] = '^Commissions'
+reg['com'] = '^(Commissions|Office parlementaire)'  # no empty alternative: it would match every link
 reg['start_an'] = u'^[0-9]{0,2}\.? *Membres présents ou excusés'
 reg['start_senat'] = u'^Membres'
 reg['commission'] = u'(.*) :$'
 reg['reunion_an'] = u'^Réunion du (.*) ?à (.*) :'
 reg['reunion_senat'] = u'^(.{1,5}éance) du (.*) :'
 reg['reunion_senat_bis'] = u'^(.*), séance du (.*)$'
-reg['presents'] = u'^Présents.* ?(-|:) (.*)'
-reg['excuses'] = u'^Excusé.*(-|:) (.*)'
+reg['presents'] = u'^Présents?.* ?(-|:) (.*)'
+reg['presents_an'] = u'^Députés? [pP]résents?.* ?(-|:) (.*)'
+reg['presents_senat'] = u'^Sénateurs? [pP]résents?.* ?(-|:) (.*)'
+reg['excuses'] = u'^(?:(?:Député|Sénateur)s?\s*)?[eE]xcusé.*(-|:) (.*)'  # prefix optional: plain "Excusés : ..." lines must still match
 reg['assistent'] = u'^Assistai.* (-|:) (.*)'
 reg['civilite'] = u' ?(Mme|M\.) '
 reg['fonction_senat'] = u' \([^)]*\)'
@@ -65,10 +67,11 @@ def date_iso(datestr):
 
 prefix = 'https://www.legifrance.gouv.fr'
 
+texts_link = [u"Office parlementaire"]
 if chamber == 'an':
-  text_link = u'Commissions et organes de contrôle'
+  texts_link.append(u'Commissions et organes de contrôle')
 elif chamber == 'senat':
-  text_link = u'Commissions'
+  texts_link.append(u'Commissions')
 else:
   sys.exit('Le 1er argument doit être "an" ou "senat"')
 
@@ -93,122 +96,129 @@ def date_iso(datestr):
 if soup.title.string.strip().startswith(u'Recherche'):
   sys.exit(' no JO')
 
-else:
-  # Sauvegarde sommaire
-  with open("html/sommaire_"+day+".html", "wb") as file:
-    file.write(soup.prettify("utf-8"))
-
-  data = {}
-  commission_link = False
-
-  for link in soup.find_all('a'):
-    link_string = unicode(link.string).strip()
-    if re.search(reg['com'], link_string, re.IGNORECASE) is not None:
+# Sauvegarde sommaire
+with open("html/sommaire_"+day+".html", "wb") as file:
+  file.write(soup.prettify("utf-8"))
 
-      data['source'] = 'Journal officiel du '+date_fr
+data = {'source': 'Journal officiel du '+date_fr}
+commission_link = False
+json_file = ''
 
-      # Commission
-      if link_string == text_link:
-        commission_link = True
-        n_presences = 0
-        com_link = prefix+link.get('href')
-        coms_doc = urllib2.urlopen(com_link)
-        soup = BeautifulSoup(coms_doc.read(), "lxml")
+for link in soup.find_all('a'):
+  link_string = unicode(link.string).strip()
+  if re.search(reg['com'], link_string, re.IGNORECASE) is not None:
 
-        # Sauvegarde commissions
-        with open("html/coms_"+chamber+"_"+day+".html", "wb") as file:
-          file.write(soup.prettify("utf-8"))
+    # Commission
+    if any([link_string.startswith(text_link) for text_link in texts_link]):
+      if link_string.startswith(u"Office parlementaire"):
+        data['commission'] = link_string
+      commission_link = True
+      n_presences = 0
+      com_link = prefix+link.get('href')
+      coms_doc = urllib2.urlopen(com_link)
+      soup = BeautifulSoup(coms_doc.read(), "lxml")
 
-        t = soup.find_all("div", "article")
+      # Sauvegarde commissions
+      with open("html/coms_"+chamber+"_"+day+".html", "wb") as file:
+        file.write(soup.prettify("utf-8"))
 
-        for br in t[0].findAll('br'):
-          br.replace_with(os.linesep)
+      t = soup.find_all("div", "article")
 
-        on = False
+      for br in t[0].findAll('br'):
+        br.replace_with(os.linesep)
 
-        com_text = ''
+      on = False
 
-        for line in t[0].get_text().split(os.linesep):
-          line = line.strip()
+      com_text = ''
 
-          # Détecter début
-          if re.search(reg['start_'+chamber], line, re.IGNORECASE) is not None:
-            on = True
+      for line in t[0].get_text().split(os.linesep):
+        line = line.strip().replace(u'\xa0', ' ')  # normalise no-break spaces so the ' :' / ' - ' regexes match
 
-          # Pre-process
-          if on and line:
-            if line.startswith(u'Présent') is False and line.startswith(u'Excusé') is False and line.startswith(u'Assistai') is False and line.startswith(u'Ont') is False and line.startswith(u'Les') is False and line.startswith(u'ERRATUM') is False and line.endswith(u' :') is False:
-              line = line+u' :'
+        # Détecter début
+        if re.search(reg['start_'+chamber], line, re.IGNORECASE) is not None:
+          on = True
 
-            if line.startswith(u'Les') and line.endswith(u' :'):
-              line = line[0:-2]
+        # Pre-process
+        if on and line:
+          if line.startswith(u'Présent') is False and line.startswith(u'Excusé') is False and line.startswith(u'Assistai') is False and line.startswith(u'Ont') is False and line.startswith(u'Les') is False and line.startswith(u'ERRATUM') is False and line.endswith(u' :') is False:
+            line = line+u' :'
 
-            com_text += line+os.linesep
+          if line.startswith(u'Les') and line.endswith(u' :'):
+            line = line[0:-2]
 
-        com_text = com_text.replace(u"’", u"'")
+          com_text += line+os.linesep
 
-        json_file = ''
+      com_text = com_text.replace(u"’", u"'")
 
-        for line in com_text.split(os.linesep):
+      for line in com_text.split(os.linesep):
 
-          #print >> sys.stderr, line
-          if re.search(reg['presents'], line, re.IGNORECASE) is not None:
-            m = re.search(reg['presents'], line, re.IGNORECASE)
-            presents = re.sub(reg['civilite'], "", m.group(2))
+        #print >> sys.stderr, line
+        m = re.search(reg['presents'], line, re.IGNORECASE)
+        if not m:
+          m = re.search(reg['presents_'+chamber], line)
+        if m:
+          sys.stderr.write(repr(m.groups())+'\n')  # debug trace on stderr: stdout carries the JSON output
+          presents = re.sub(reg['civilite'], "", m.group(2))
 
-            for present in presents.split(','):
-              if chamber == "senat":
-                data['senateur'] = present.strip().strip('.')
-              else:
-                data['depute'] = present.strip().strip('.')
-              json_file += json.dumps(data, separators=(',',':'), ensure_ascii=False, sort_keys=True)+os.linesep
-              n_presences += 1
-
-          elif re.search(reg['assistent'], line, re.IGNORECASE) is not None:
-            m = re.search(reg['assistent'], line, re.IGNORECASE)
-            presents = re.sub(reg['civilite'], "", m.group(2))
+          for present in presents.split(','):
+            if chamber == "senat":
+              data['senateur'] = present.strip('. :')
+            else:
+              data['depute'] = present.strip('. :')
+            json_file += json.dumps(data, separators=(',',':'), ensure_ascii=False, sort_keys=True)+os.linesep
+            n_presences += 1
+        elif re.search(reg['assistent'], line, re.IGNORECASE) is not None:
+          m = re.search(reg['assistent'], line, re.IGNORECASE)
+          presents = re.sub(reg['civilite'], "", m.group(2))
+          if chamber == "senat":
+            presents = re.sub(reg['fonction_senat'], "", presents, flags=re.I)  # 4th positional arg of re.sub is count, not flags
+
+          for present in presents.split(','):
             if chamber == "senat":
-              presents = re.sub(reg['fonction_senat'], "", presents)
+              data['senateur'] = present.strip('. :')
+            else:
+              data['depute'] = present.strip('. :')
+            json_file += json.dumps(data, separators=(',',':'), ensure_ascii=False, sort_keys=True)+os.linesep
+            n_presences += 1
 
-            for present in presents.split(','):
-              if chamber == "senat":
-                data['senateur'] = present.strip().strip('.')
-              else:
-                data['depute'] = present.strip().strip('.')
-              json_file += json.dumps(data, separators=(',',':'), ensure_ascii=False, sort_keys=True)+os.linesep
-              n_presences += 1
+        elif (chamber == 'an' and re.search(reg['presents_senat'], line, re.I)) or (chamber == 'senat' and re.search(reg['presents_an'], line, re.I)):
+          pass
+        elif re.search(reg['excuses'], line) is not None:
+          pass
+        elif re.search(reg['commission'], line) is not None:
 
-          elif re.search(reg['excuses'], line) is not None:
+          if re.search(reg['start_senat'], line, re.IGNORECASE):
             pass
-          elif re.search(reg['commission'], line) is not None:
-
-            if re.search(reg['reunion_an'], line, re.IGNORECASE) is not None:
-              m = re.search(reg['reunion_an'], line, re.IGNORECASE)
-              data['reunion'] = date_iso(m.group(1))
-              data['session'] = m.group(2).replace(' :', '').replace(' h ', ':').replace(' heures', ':00')[0:5]
-            elif re.search(reg['reunion_senat'], line, re.IGNORECASE) is not None:
-              m = re.search(reg['reunion_senat'], line, re.IGNORECASE)
-              data['date'] = date_iso(m.group(2))
-              data['heure'] = m.group(1).replace(u'Séance', '')
-            else:
-              m = re.search(reg['commission'], line)
-              data['commission'] = re.sub(':', '', m.group(1)).strip()
-
-              if chamber == "senat" and re.search(reg['reunion_senat_bis'], data['commission'], re.IGNORECASE):
-                m = re.search(reg['reunion_senat_bis'], data['commission'], re.IGNORECASE)
-                data['date'] = date_iso(m.group(2))
-                data['heure'] = ''
-                data['commission'] = m.group(1)
-
-        if not json_file:
-          sys.exit(' no attendance '+com_link)
-        else:
-          sys.stderr.write(str(n_presences)+' présences '+com_link+'\n')
-          if stdout:
-            print(json_file.strip().encode('utf-8'))
+          elif re.search(reg['reunion_an'], line, re.IGNORECASE) is not None:
+            m = re.search(reg['reunion_an'], line, re.IGNORECASE)
+            data['reunion'] = date_iso(m.group(1))
+            data['session'] = m.group(2).replace(' :', '').replace(' h ', ':').replace(' heures', ':00')[0:5]
+          elif re.search(reg['reunion_senat'], line, re.IGNORECASE) is not None:
+            m = re.search(reg['reunion_senat'], line, re.IGNORECASE)
+            data['date'] = date_iso(m.group(2))
+            data['heure'] = m.group(1).replace(u'Séance', '')
           else:
-            with open("json/"+chamber+"_"+day+".json", "wb") as file:
-              file.write(json_file.strip().encode('utf-8'))
+            m = re.search(reg['commission'], line)
+            data['commission'] = re.sub(':', '', m.group(1)).strip()
+
+            if chamber == "senat" and re.search(reg['reunion_senat_bis'], data['commission'], re.IGNORECASE):
+              m = re.search(reg['reunion_senat_bis'], data['commission'], re.IGNORECASE)
+              data['date'] = date_iso(m.group(2))
+              data['heure'] = ''
+              data['commission'] = m.group(1)
+
+      if not n_presences:
+        sys.exit(' no attendance '+com_link)
+      else:
+        sys.stderr.write(str(n_presences)+' présences '+com_link+'\n')
+      data['commission'] = ""
+
+if json_file:
+  if stdout:
+    print(json_file.strip().encode('utf-8'))
+  else:
+    with open("json/"+chamber+"_"+day+".json", "wb") as file:
+      file.write(json_file.strip().encode('utf-8'))
 
 if not commission_link:
   sys.exit(' no commission ')