---
simplynews_sites/lefigaro.py | 134 ++++++++++++++++++++---------------
1 file changed, 78 insertions(+), 56 deletions(-)
diff --git a/simplynews_sites/lefigaro.py b/simplynews_sites/lefigaro.py
index 10ad397..6aad359 100644
--- a/simplynews_sites/lefigaro.py
+++ b/simplynews_sites/lefigaro.py
@@ -38,15 +38,16 @@ def get_page(url):
post = soup.select_one("article")
- json_element = soup.find("script", type="application/ld+json")
- if json_element is not None:
- info_json = json.loads(json_element.next)
-
- if subtitle.endswith("..."):
+ if subtitle.endswith("...") and post is not None:
+ # post is None on a "story" page (or an unknown page type)
standfirst = post.select_one("p.fig-standfirst")
if standfirst is not None:
subtitle = standfirst.text
+ json_element = soup.find("script", type="application/ld+json")
+ if json_element is not None:
+ info_json = json.loads(json_element.next)
+
for element in info_json:
if element["@type"] == "NewsArticle":
last_updated = element.get("dateModified")
@@ -75,70 +76,88 @@ def get_page(url):
article = []
- heading_image = post.select_one(
- "article > figure.fig-media img") or soup.select_one("div.fig-wrapper figure.fig-media img")
- if heading_image is not None:
- article.append(get_image(heading_image))
-
- post_content = post.select_one("div.fig-content-body")
-
- if post_content is None: # not a regular article
- poll_element = post.select_one("div.fig-poll") # poll "article"
- if poll_element is not None:
- entries = []
- results = poll_element.select("div.fig-poll__result")
- for result in results:
- percentage = result.get("data-percentage")
- label = result.select_one("span.fig-poll__label").text
+ if post is None:
+ # no <article> element found: "story" page (or an unknown page type)
+ amp_video = soup.select_one("amp-video")
+ if amp_video is not None:
+ thumbnail_src = utils.get_property(amp_video, "poster")
+ sources = amp_video.select("source") or []
+ # DASH (mpd) and HLS (m3u8) are also provided for the same video
+ # but we only select mp4 because it's more widely supported
+ for source in sources:
+ src = utils.get_property(source, "src")
+ if utils.get_property(source, "type") == "video/mp4":
+ article.append({
+ "type": "video",
+ "src": src,
+ "poster": thumbnail_src
+ })
+ else:
+ heading_image = post.select_one(
+ "article > figure.fig-media img") or soup.select_one("div.fig-wrapper figure.fig-media img")
+ if heading_image is not None:
+ article.append(get_image(heading_image))
- entries.append({"value": "{} : {}%".format(label, percentage)})
+ post_content = post.select_one("div.fig-content-body")
- article.append({
- "type": "unsorted list",
- "entries": entries
- })
+ if post_content is None: # not a regular article
+ poll_element = post.select_one("div.fig-poll") # poll
+ if poll_element is not None:
+ entries = []
+ results = poll_element.select("div.fig-poll__result")
+ for result in results:
+ percentage = result.get("data-percentage")
+ label = result.select_one("span.fig-poll__label").text
- votes = poll_element.get("data-voters")
+ entries.append({"value": "{} : {}%".format(label, percentage)})
- article.append({
- "type": "paragraph",
- "value": "{} votes".format(votes)
- })
+ article.append({
+ "type": "unsorted list",
+ "entries": entries
+ })
- data["article"] = article
- return data
+ votes = poll_element.get("data-voters")
- live_messages = post.select("article.live-message") # live "article"
- if live_messages is not None:
- for message in live_messages:
- message_title = message.select_one(".live-title")
article.append({
- "type": "header",
- "size": "h2",
- "value": message_title.text
+ "type": "paragraph",
+ "value": "{} votes".format(votes)
})
- date = message.select_one("time")
- if date is not None:
- # date_time = datetime.fromisoformat(date["datetime"])
+
+ data["article"] = article
+ return data
+
+ live_messages = post.select("article.live-message")
+ # live "article"
+ if live_messages is not None:
+ for message in live_messages:
+ message_title = message.select_one(".live-title")
article.append({
- "type": "paragraph",
- "value": "Publié {}".format(date.text)
+ "type": "header",
+ "size": "h2",
+ "value": message_title.text
})
+ date = message.select_one("time")
+ if date is not None:
+ # date_time = datetime.fromisoformat(date["datetime"])
+ article.append({
+ "type": "paragraph",
+ "value": "Publié {}".format(date.text)
+ })
+
+ message_body = message.select_one("div.live-article")
+ for element in message_body:
+ el = get_element(element, True)
+ if el is not None and el != {}:
+ article.append(el)
- message_body = message.select_one("div.live-article")
- for element in message_body:
- el = get_element(element, True)
- if el is not None and el != {}:
- article.append(el)
-
- data["article"] = article
- return data
+ data["article"] = article
+ return data
- for element in post_content:
- el = get_element(element)
+ for element in post_content:
+ el = get_element(element)
- if el is not None and el != {}:
- article.append(el)
+ if el is not None and el != {}:
+ article.append(el)
data["article"] = article
return data
@@ -237,6 +256,9 @@ if __name__ == "__main__":
# page_url = "sciences/en-direct-covid-19-les-alpes-maritimes-attendent-les-decisions-du-gouvernement-20210222"
# "live" article
+ # page_url = "story/gilles-kepel--la-victoire-des-talibans-est-une-bonne-nouvelle-pour-les-islamistes-sunnites-dans-le-monde-14454"
+ # "story"
+
page_url = "confinement-partiel-commerces-ce-qu-il-faut-retenir-des-mesures-de-restriction-dans-les-alpes-maritimes-20210222"
# multiple authors
--
2.31.1