Skip to content

Commit

Permalink
catch more presences JO (wip #108)
Browse files (browse the repository at this point in the history)
  • Loading branch information
RouxRC committed Apr 21, 2018
1 parent c2ba746 commit 4981179
Showing 1 changed file with 109 additions and 99 deletions.
208 changes: 109 additions & 99 deletions batch/jo/parse_jo.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -35,15 +35,17 @@ def date_iso(datestr):

reg = {}
reg['date'] = '^([0-9]{4})-([0-9]{2})-([0-9]{2})$'
reg['com'] = '^Commissions'
reg['com'] = '^(Commissions|Office parlementaire|)'
reg['start_an'] = u'^[0-9]{0,2}\.? *Membres présents ou excusés'
reg['start_senat'] = u'^Membres'
reg['commission'] = u'(.*) :$'
reg['reunion_an'] = u'^Réunion du (.*) ?à (.*) :'
reg['reunion_senat'] = u'^(.{1,5}éance) du (.*) :'
reg['reunion_senat_bis'] = u'^(.*), séance du (.*)$'
reg['presents'] = u'^Présents.* ?(-|:) (.*)'
reg['excuses'] = u'^Excusé.*(-|:) (.*)'
reg['presents'] = u'^Présents?.* ?(-|:) (.*)'
reg['presents_an'] = u'^Députés? [pP]résents?.* ?(-|:) (.*)'
reg['presents_senat'] = u'^Sénateurs? [pP]résents?.* ?(-|:) (.*)'
reg['excuses'] = u'^(?:Député|Sénateur)s?\s*[eE]xcusé.*(-|:) (.*)'
reg['assistent'] = u'^Assistai.* (-|:) (.*)'
reg['civilite'] = u' ?(Mme|M\.) '
reg['fonction_senat'] = u' \([^)]*\)'
Expand All @@ -65,10 +67,11 @@ def date_iso(datestr):

prefix = 'https://www.legifrance.gouv.fr'

texts_link = [u"Office parlementaire"]
if chamber == 'an':
text_link = u'Commissions et organes de contrôle'
texts_link.append(u'Commissions et organes de contrôle')
elif chamber == 'senat':
text_link = u'Commissions'
texts_link.append(u'Commissions')
else:
sys.exit('Le 1er argument doit être "an" ou "senat"')

Expand All @@ -93,122 +96,129 @@ def date_iso(datestr):
if soup.title.string.strip().startswith(u'Recherche'):
sys.exit(' no JO')

else:
# Sauvegarde sommaire
with open("html/sommaire_"+day+".html", "wb") as file:
file.write(soup.prettify("utf-8"))

data = {}
commission_link = False

for link in soup.find_all('a'):
link_string = unicode(link.string).strip()
if re.search(reg['com'], link_string, re.IGNORECASE) is not None:
# Sauvegarde sommaire
with open("html/sommaire_"+day+".html", "wb") as file:
file.write(soup.prettify("utf-8"))

data['source'] = 'Journal officiel du '+date_fr
data = {'source': 'Journal officiel du '+date_fr}
commission_link = False
json_file = ''

# Commission
if link_string == text_link:
commission_link = True
n_presences = 0
com_link = prefix+link.get('href')
coms_doc = urllib2.urlopen(com_link)
soup = BeautifulSoup(coms_doc.read(), "lxml")
for link in soup.find_all('a'):
link_string = unicode(link.string).strip()
if re.search(reg['com'], link_string, re.IGNORECASE) is not None:

# Sauvegarde commissions
with open("html/coms_"+chamber+"_"+day+".html", "wb") as file:
file.write(soup.prettify("utf-8"))
# Commission
if any([link_string.startswith(text_link) for text_link in texts_link]):
if link_string.startswith(u"Office parlementaire"):
data['commission'] = link_string
commission_link = True
n_presences = 0
com_link = prefix+link.get('href')
coms_doc = urllib2.urlopen(com_link)
soup = BeautifulSoup(coms_doc.read(), "lxml")

t = soup.find_all("div", "article")
# Sauvegarde commissions
with open("html/coms_"+chamber+"_"+day+".html", "wb") as file:
file.write(soup.prettify("utf-8"))

for br in t[0].findAll('br'):
br.replace_with(os.linesep)
t = soup.find_all("div", "article")

on = False
for br in t[0].findAll('br'):
br.replace_with(os.linesep)

com_text = ''
on = False

for line in t[0].get_text().split(os.linesep):
line = line.strip()
com_text = ''

# Détecter début
if re.search(reg['start_'+chamber], line, re.IGNORECASE) is not None:
on = True
for line in t[0].get_text().split(os.linesep):
line = line.strip().replace(u' ', ' ')

# Pre-process
if on and line:
if line.startswith(u'Présent') is False and line.startswith(u'Excusé') is False and line.startswith(u'Assistai') is False and line.startswith(u'Ont') is False and line.startswith(u'Les') is False and line.startswith(u'ERRATUM') is False and line.endswith(u' :') is False:
line = line+u' :'
# Détecter début
if re.search(reg['start_'+chamber], line, re.IGNORECASE) is not None:
on = True

if line.startswith(u'Les') and line.endswith(u' :'):
line = line[0:-2]
# Pre-process
if on and line:
if line.startswith(u'Présent') is False and line.startswith(u'Excusé') is False and line.startswith(u'Assistai') is False and line.startswith(u'Ont') is False and line.startswith(u'Les') is False and line.startswith(u'ERRATUM') is False and line.endswith(u' :') is False:
line = line+u' :'

com_text += line+os.linesep
if line.startswith(u'Les') and line.endswith(u' :'):
line = line[0:-2]

com_text = com_text.replace(u"’", u"'")
com_text += line+os.linesep

json_file = ''
com_text = com_text.replace(u"’", u"'")

for line in com_text.split(os.linesep):
for line in com_text.split(os.linesep):

#print >> sys.stderr, line
if re.search(reg['presents'], line, re.IGNORECASE) is not None:
m = re.search(reg['presents'], line, re.IGNORECASE)
presents = re.sub(reg['civilite'], "", m.group(2))
#print >> sys.stderr, line
m = re.search(reg['presents'], line, re.IGNORECASE)
if not m:
m = re.search(reg['presents_'+chamber], line)
if m:
print(m.groups())
presents = re.sub(reg['civilite'], "", m.group(2))

for present in presents.split(','):
if chamber == "senat":
data['senateur'] = present.strip().strip('.')
else:
data['depute'] = present.strip().strip('.')
json_file += json.dumps(data, separators=(',',':'), ensure_ascii=False, sort_keys=True)+os.linesep
n_presences += 1

elif re.search(reg['assistent'], line, re.IGNORECASE) is not None:
m = re.search(reg['assistent'], line, re.IGNORECASE)
presents = re.sub(reg['civilite'], "", m.group(2))
for present in presents.split(','):
if chamber == "senat":
data['senateur'] = present.strip('. :')
else:
data['depute'] = present.strip('. :')
json_file += json.dumps(data, separators=(',',':'), ensure_ascii=False, sort_keys=True)+os.linesep
n_presences += 1
elif re.search(reg['assistent'], line, re.IGNORECASE) is not None:
m = re.search(reg['assistent'], line, re.IGNORECASE)
presents = re.sub(reg['civilite'], "", m.group(2))
if chamber == "senat":
presents = re.sub(reg['fonction_senat'], "", presents, re.I)

for present in presents.split(','):
if chamber == "senat":
presents = re.sub(reg['fonction_senat'], "", presents)
data['senateur'] = present.strip('. :')
else:
data['depute'] = present.strip('. :')
json_file += json.dumps(data, separators=(',',':'), ensure_ascii=False, sort_keys=True)+os.linesep
n_presences += 1

for present in presents.split(','):
if chamber == "senat":
data['senateur'] = present.strip().strip('.')
else:
data['depute'] = present.strip().strip('.')
json_file += json.dumps(data, separators=(',',':'), ensure_ascii=False, sort_keys=True)+os.linesep
n_presences += 1
elif (chamber == 'an' and re.search(reg['presents_senat'], line, re.I)) or (chamber == 'senat' and re.search(reg['presents_an'], line, re.I)):
pass
elif re.search(reg['excuses'], line) is not None:
pass
elif re.search(reg['commission'], line) is not None:

elif re.search(reg['excuses'], line) is not None:
if re.search(reg['start_senat'], line, re.IGNORECASE):
pass
elif re.search(reg['commission'], line) is not None:

if re.search(reg['reunion_an'], line, re.IGNORECASE) is not None:
m = re.search(reg['reunion_an'], line, re.IGNORECASE)
data['reunion'] = date_iso(m.group(1))
data['session'] = m.group(2).replace(' :', '').replace(' h ', ':').replace(' heures', ':00')[0:5]
elif re.search(reg['reunion_senat'], line, re.IGNORECASE) is not None:
m = re.search(reg['reunion_senat'], line, re.IGNORECASE)
data['date'] = date_iso(m.group(2))
data['heure'] = m.group(1).replace(u'Séance', '')
else:
m = re.search(reg['commission'], line)
data['commission'] = re.sub(':', '', m.group(1)).strip()

if chamber == "senat" and re.search(reg['reunion_senat_bis'], data['commission'], re.IGNORECASE):
m = re.search(reg['reunion_senat_bis'], data['commission'], re.IGNORECASE)
data['date'] = date_iso(m.group(2))
data['heure'] = ''
data['commission'] = m.group(1)

if not json_file:
sys.exit(' no attendance '+com_link)
else:
sys.stderr.write(str(n_presences)+' présences '+com_link+'\n')
if stdout:
print(json_file.strip().encode('utf-8'))
elif re.search(reg['reunion_an'], line, re.IGNORECASE) is not None:
m = re.search(reg['reunion_an'], line, re.IGNORECASE)
data['reunion'] = date_iso(m.group(1))
data['session'] = m.group(2).replace(' :', '').replace(' h ', ':').replace(' heures', ':00')[0:5]
elif re.search(reg['reunion_senat'], line, re.IGNORECASE) is not None:
m = re.search(reg['reunion_senat'], line, re.IGNORECASE)
data['date'] = date_iso(m.group(2))
data['heure'] = m.group(1).replace(u'Séance', '')
else:
with open("json/"+chamber+"_"+day+".json", "wb") as file:
file.write(json_file.strip().encode('utf-8'))
m = re.search(reg['commission'], line)
data['commission'] = re.sub(':', '', m.group(1)).strip()

if chamber == "senat" and re.search(reg['reunion_senat_bis'], data['commission'], re.IGNORECASE):
m = re.search(reg['reunion_senat_bis'], data['commission'], re.IGNORECASE)
data['date'] = date_iso(m.group(2))
data['heure'] = ''
data['commission'] = m.group(1)

if not n_presences:
sys.exit(' no attendance '+com_link)
else:
sys.stderr.write(str(n_presences)+' présences '+com_link+'\n')
data['commission'] = ""

if json_file:
if stdout:
print(json_file.strip().encode('utf-8'))
else:
with open("json/"+chamber+"_"+day+".json", "wb") as file:
file.write(json_file.strip().encode('utf-8'))

if not commission_link:
sys.exit(' no commission ')
Expand Down

0 comments on commit 4981179

Please sign in to comment.