# Post-patch state of the three hunks of batch/jo/parse_jo.py shown in the
# diff, reformatted as regular Python 2 source with review fixes applied.
# NOTE(review): code lying between the hunks (argument parsing, download of
# the summary page, definitions of `chamber`, `day`, `date_fr`, `soup`,
# `stdout`, and the body of `date_iso`) is not visible here and is not
# reproduced.

# ---------------------------------------------------------------------------
# Hunk 1: regular expressions used to parse the attendance lists.
# Backslashes are doubled so the literals hold the same pattern text without
# relying on unknown string escapes being passed through.
# ---------------------------------------------------------------------------
reg = {}
reg['date'] = '^([0-9]{4})-([0-9]{2})-([0-9]{2})$'
# Link texts worth inspecting. The patch ended this alternation with an empty
# branch ("...|)"), which matches every string and disabled the filter.
reg['com'] = '^(Commissions|Office parlementaire)'
reg['start_an'] = u'^[0-9]{0,2}\\.? *Membres présents ou excusés'
reg['start_senat'] = u'^Membres'
reg['commission'] = u'(.*) :$'
reg['reunion_an'] = u'^Réunion du (.*) ?à (.*) :'
reg['reunion_senat'] = u'^(.{1,5}éance) du (.*) :'
reg['reunion_senat_bis'] = u'^(.*), séance du (.*)$'
reg['presents'] = u'^Présents?.* ?(-|:) (.*)'
reg['presents_an'] = u'^Députés? [pP]résents?.* ?(-|:) (.*)'
reg['presents_senat'] = u'^Sénateurs? [pP]résents?.* ?(-|:) (.*)'
# The "Député(s)" / "Sénateur(s)" prefix is optional so that plain
# "Excusés ..." lines (the only form matched before the patch) are still
# recognised and skipped instead of reaching the commission-name branch.
reg['excuses'] = u'^(?:(?:Député|Sénateur)s?\\s*)?[eE]xcusé.*(-|:) (.*)'
reg['assistent'] = u'^Assistai.* (-|:) (.*)'
reg['civilite'] = u' ?(Mme|M\\.) '
reg['fonction_senat'] = u' \\([^)]*\\)'

# ---------------------------------------------------------------------------
# Hunk 2: link texts, on the summary page, that lead to attendance lists.
# ---------------------------------------------------------------------------
prefix = 'https://www.legifrance.gouv.fr'

texts_link = [u"Office parlementaire"]
if chamber == 'an':
    texts_link.append(u'Commissions et organes de contrôle')
elif chamber == 'senat':
    texts_link.append(u'Commissions')
else:
    sys.exit('Le 1er argument doit être "an" ou "senat"')

# ---------------------------------------------------------------------------
# Hunk 3: walk the summary page, fetch each attendance page and emit one JSON
# record per member listed as present or attending.
# ---------------------------------------------------------------------------
if soup.title.string.strip().startswith(u'Recherche'):
    sys.exit(' no JO')

# Save the summary page.
with open("html/sommaire_"+day+".html", "wb") as fh:
    fh.write(soup.prettify("utf-8"))

data = {'source': 'Journal officiel du '+date_fr}
commission_link = False
json_file = ''

# Key under which a member name is stored in each record.
member_key = 'senateur' if chamber == "senat" else 'depute'
# Regex key of the *other* chamber's "présents" lines, which must be ignored.
other_presents = 'presents_senat' if chamber == 'an' else 'presents_an'
# Line prefixes that are left untouched by the " :" normalisation below.
untouched_prefixes = (u'Présent', u'Excusé', u'Assistai', u'Ont', u'Les',
                      u'ERRATUM')

for link in soup.find_all('a'):
    link_string = unicode(link.string).strip()
    if re.search(reg['com'], link_string, re.IGNORECASE) is None:
        continue
    if not link_string.startswith(tuple(texts_link)):
        continue

    # For "Office parlementaire" links the link text itself is the body name.
    if link_string.startswith(u"Office parlementaire"):
        data['commission'] = link_string
    commission_link = True
    n_presences = 0
    com_link = prefix+link.get('href')
    coms_doc = urllib2.urlopen(com_link)
    try:
        # `soup` is rebound here as in the original; the outer loop keeps
        # iterating over the list already returned by find_all().
        soup = BeautifulSoup(coms_doc.read(), "lxml")
    finally:
        coms_doc.close()

    # Save the attendance page.
    # NOTE(review): the file name does not depend on the link, so when
    # several links match on the same day each page overwrites the last.
    with open("html/coms_"+chamber+"_"+day+".html", "wb") as fh:
        fh.write(soup.prettify("utf-8"))

    t = soup.find_all("div", "article")

    for br in t[0].findAll('br'):
        br.replace_with(os.linesep)

    on = False
    com_text = ''

    for line in t[0].get_text().split(os.linesep):
        # NOTE(review): the replaced character is shown as a plain space in
        # the diff, which would be a no-op; assumed to be U+00A0
        # (non-breaking space) -- confirm against the repository.
        line = line.strip().replace(u'\xa0', ' ')

        # Detect the start of the attendance section.
        if re.search(reg['start_'+chamber], line, re.IGNORECASE) is not None:
            on = True

        # Pre-process: heading-like lines get a trailing " :" so that
        # reg['commission'] can recognise them later.
        if on and line:
            if not line.startswith(untouched_prefixes) and not line.endswith(u' :'):
                line = line+u' :'

            if line.startswith(u'Les') and line.endswith(u' :'):
                line = line[0:-2]

            com_text += line+os.linesep

    com_text = com_text.replace(u"’", u"'")

    for line in com_text.split(os.linesep):

        # "Présents" lines, generic form first, then this chamber's form.
        m = re.search(reg['presents'], line, re.IGNORECASE)
        if not m:
            m = re.search(reg['presents_'+chamber], line)
        strip_functions = False
        if not m:
            # "Assistaient" lines; Senate ones carry a parenthesised function.
            m = re.search(reg['assistent'], line, re.IGNORECASE)
            strip_functions = chamber == "senat"

        if m:
            presents = re.sub(reg['civilite'], "", m.group(2))
            if strip_functions:
                # No flags needed (the pattern has no letters). The patch
                # passed re.I as the positional `count`, capping the number
                # of replacements at 2.
                presents = re.sub(reg['fonction_senat'], "", presents)

            for present in presents.split(','):
                data[member_key] = present.strip('. :')
                json_file += json.dumps(data, separators=(',', ':'), ensure_ascii=False, sort_keys=True)+os.linesep
                n_presences += 1

        elif re.search(reg[other_presents], line, re.I):
            # Members of the other chamber: ignored.
            pass
        elif re.search(reg['excuses'], line) is not None:
            pass
        elif re.search(reg['commission'], line) is not None:

            if re.search(reg['start_senat'], line, re.IGNORECASE):
                # Section header ("Membres ..."), not a commission name.
                pass
            elif re.search(reg['reunion_an'], line, re.IGNORECASE) is not None:
                m = re.search(reg['reunion_an'], line, re.IGNORECASE)
                data['reunion'] = date_iso(m.group(1))
                data['session'] = m.group(2).replace(' :', '').replace(' h ', ':').replace(' heures', ':00')[0:5]
            elif re.search(reg['reunion_senat'], line, re.IGNORECASE) is not None:
                m = re.search(reg['reunion_senat'], line, re.IGNORECASE)
                data['date'] = date_iso(m.group(2))
                data['heure'] = m.group(1).replace(u'Séance', '')
            else:
                m = re.search(reg['commission'], line)
                data['commission'] = re.sub(':', '', m.group(1)).strip()

                if chamber == "senat":
                    m = re.search(reg['reunion_senat_bis'], data['commission'], re.IGNORECASE)
                    if m:
                        data['date'] = date_iso(m.group(2))
                        data['heure'] = ''
                        data['commission'] = m.group(1)

    if not n_presences:
        sys.exit(' no attendance '+com_link)
    else:
        sys.stderr.write(str(n_presences)+' présences '+com_link+'\n')
    # Reset so the next link does not inherit this page's last commission.
    data['commission'] = ""

if json_file:
    if stdout:
        print(json_file.strip().encode('utf-8'))
    else:
        with open("json/"+chamber+"_"+day+".json", "wb") as fh:
            fh.write(json_file.strip().encode('utf-8'))

if not commission_link:
    sys.exit(' no commission ')