Fix #1706: the marmiton and 750g modules now work again, ported to browser2

This commit is contained in:
Bezleputh 2015-01-19 15:07:04 +01:00 committed by Romain Bignon
commit 8250846448
9 changed files with 151 additions and 260 deletions

View file

@ -18,33 +18,25 @@
# along with weboob. If not, see <http://www.gnu.org/licenses/>. # along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.deprecated.browser import Browser, BrowserHTTPNotFound from weboob.browser import PagesBrowser, URL
from .pages import RecipePage, ResultsPage from .pages import RecipePage, ResultsPage
__all__ = ['SevenFiftyGramsBrowser'] __all__ = ['SevenFiftyGramsBrowser']
class SevenFiftyGramsBrowser(Browser): class SevenFiftyGramsBrowser(PagesBrowser):
DOMAIN = 'www.750g.com' BASEURL = 'http://www.750g.com/'
PROTOCOL = 'http'
ENCODING = 'windows-1252' search = URL('recettes_(?P<pattern>.*).htm', ResultsPage)
USER_AGENT = Browser.USER_AGENTS['wget'] recipe = URL('(?P<id>.*).htm', RecipePage)
PAGES = {
'http://www.750g.com/recettes_.*.htm': ResultsPage,
'http://www.750g.com/fiche_de_cuisine_complete.htm\?recettes_id=[0-9]*': RecipePage,
}
def iter_recipes(self, pattern): def iter_recipes(self, pattern):
self.location('http://www.750g.com/recettes_%s.htm' % (pattern.replace(' ', '_'))) return self.search.go(pattern=pattern.replace(' ', '_')).iter_recipes()
assert self.is_on_page(ResultsPage)
return self.page.iter_recipes()
def get_recipe(self, id): def get_recipe(self, id, recipe=None):
try: recipe = self.recipe.go(id=id).get_recipe(obj=recipe)
self.location('http://www.750g.com/fiche_de_cuisine_complete.htm?recettes_id=%s' % id) comments = list(self.page.get_comments())
except BrowserHTTPNotFound: if comments:
return recipe.comments = comments
if self.is_on_page(RecipePage): return recipe
return self.page.get_recipe(id)

View file

@ -48,16 +48,7 @@ class SevenFiftyGramsModule(Module, CapRecipe):
def fill_recipe(self, recipe, fields): def fill_recipe(self, recipe, fields):
if 'nb_person' in fields or 'instructions' in fields: if 'nb_person' in fields or 'instructions' in fields:
rec = self.get_recipe(recipe.id) recipe = self.browser.get_recipe(recipe.id, recipe)
recipe.picture_url = rec.picture_url
recipe.instructions = rec.instructions
recipe.ingredients = rec.ingredients
recipe.comments = rec.comments
recipe.author = rec.author
recipe.nb_person = rec.nb_person
recipe.cooking_time = rec.cooking_time
recipe.preparation_time = rec.preparation_time
return recipe return recipe
OBJECTS = { OBJECTS = {

View file

@ -19,128 +19,75 @@
from weboob.capabilities.recipe import Recipe, Comment from weboob.capabilities.recipe import Recipe, Comment
from weboob.capabilities.base import NotAvailable, NotLoaded from weboob.capabilities.base import NotAvailable
from weboob.deprecated.browser import Page from weboob.browser.pages import HTMLPage, pagination
from weboob.browser.elements import ItemElement, ListElement, method
from weboob.browser.filters.standard import CleanText, Regexp, Env, Type, Filter
from weboob.browser.filters.html import CleanHTML
class ResultsPage(Page): class Time(Filter):
def filter(self, el):
if el:
if 'h' in el:
return 60*int(el.split()[0])
return int(el.split()[0])
class ResultsPage(HTMLPage):
""" Page which contains results as a list of recipes """ Page which contains results as a list of recipes
""" """
@pagination
@method
class iter_recipes(ListElement):
item_xpath = '//li[@data-type="recette"]'
def iter_recipes(self): def next_page(self):
for div in self.parser.select(self.document.getroot(), 'div.recette_description > div.data'): return CleanText('//li[@class="suivante"]/a/@href')(self)
links = self.parser.select(div, 'div.info > p.title > a.fn')
if len(links) > 0:
link = links[0]
title = unicode(link.text)
# id = unicode(link.attrib.get('href','').strip('/').replace('.htm','htm'))
id = unicode(self.parser.select(div, 'div.carnet-add a', 1).attrib.get('href', '').split('=')[-1])
thumbnail_url = NotAvailable
short_description = NotAvailable
imgs = self.parser.select(div, 'img.recipe-image') class item(ItemElement):
if len(imgs) > 0: klass = Recipe
thumbnail_url = unicode(imgs[0].attrib.get('src', '')) obj_id = Regexp(CleanText('./div[has-class("text")]/h2/a/@href'),
short_description = unicode(' '.join(self.parser.select( '(.*).htm')
div, 'div.infos_column', 1).text_content().split()).strip()) obj_title = CleanText('./div[has-class("text")]/h2/a')
imgs_cost = self.parser.select(div, 'div.infos_column img') obj_thumbnail_url = CleanText('./div[has-class("image")]/a/img/@src')
cost_tot = len(imgs_cost) obj_short_description = CleanText('./div[has-class("text")]/p')
cost_on = 0 obj_author = CleanText('./div[has-class("text")]/h3[@class="auteur"]/a', default=NotAvailable)
for img in imgs_cost:
if img.attrib.get('src', '').endswith('euro_on.png'):
cost_on += 1
short_description += u' %s/%s' % (cost_on, cost_tot)
recipe = Recipe(id, title)
recipe.thumbnail_url = thumbnail_url
recipe.short_description = short_description
recipe.instructions = NotLoaded
recipe.ingredients = NotLoaded
recipe.nb_person = NotLoaded
recipe.cooking_time = NotLoaded
recipe.preparation_time = NotLoaded
recipe.author = NotLoaded
yield recipe
class RecipePage(Page): class RecipePage(HTMLPage):
""" Page which contains a recipe """ Page which contains a recipe
""" """
@method
class get_comments(ListElement):
item_xpath = '//section[@class="commentaires_liste"]/article'
def get_recipe(self, id): class item(ItemElement):
title = NotAvailable klass = Comment
preparation_time = NotAvailable
cooking_time = NotAvailable
nb_person = NotAvailable
ingredients = NotAvailable
picture_url = NotAvailable
instructions = NotAvailable
author = NotAvailable
comments = NotAvailable
title = unicode(self.parser.select(self.document.getroot(), 'head > title', 1).text.split(' - ')[1]) obj_id = CleanText('./@data-id')
main = self.parser.select(self.document.getroot(), 'div.recette_description', 1) obj_author = CleanText('./div[@class="column"]/p[@class="commentaire_info"]/span')
obj_text = CleanText('./div[@class="column"]/p[1]')
rec_infos = self.parser.select(self.document.getroot(), 'div.recette_infos div.infos_column strong') @method
for info_title in rec_infos: class get_recipe(ItemElement):
if u'Temps de préparation' in unicode(info_title.text): klass = Recipe
if info_title.tail.strip() != '':
preparation_time = int(info_title.tail.split()[0])
if 'h' in info_title.tail:
preparation_time = 60*preparation_time
if 'Temps de cuisson' in info_title.text:
if info_title.tail.strip() != '':
cooking_time = int(info_title.tail.split()[0])
if 'h' in info_title.tail:
cooking_time = 60*cooking_time
if 'Nombre de personnes' in info_title.text:
if info_title.tail.strip() != '':
nb_person = [int(info_title.tail)]
ingredients = [] obj_id = Env('id')
p_ing = self.parser.select(main, 'div.data.top.left > div.content p') obj_title = CleanText('//h1[@class="fn"]')
for ing in p_ing:
ingtxt = unicode(ing.text_content().strip())
if ingtxt != '':
ingredients.append(ingtxt)
lines_instr = self.parser.select(main, 'div.data.top.right div.content li') def obj_ingredients(self):
if len(lines_instr) > 0: ingredients = []
instructions = u'' for el in self.page.doc.xpath('//section[has-class("recette_ingredients")]/ul/li'):
for line in lines_instr: ingredients.append(CleanText('.')(el))
inst = ' '.join(line.text_content().strip().split()) return ingredients
instructions += '%s\n' % inst
instructions = instructions.strip('\n')
imgillu = self.parser.select(self.document.getroot(), 'div.resume_recette_illustree img.photo') obj_cooking_time = Time(CleanText('//span[@class="cooktime"]'))
if len(imgillu) > 0: obj_preparation_time = Time(CleanText('//span[@class="preptime"]'))
picture_url = unicode(imgillu[0].attrib.get('src', ''))
divcoms = self.parser.select(self.document.getroot(), 'div.comment-outer') def obj_nb_person(self):
if len(divcoms) > 0: return [Type(CleanText('//span[@class="yield"]'), type=int)(self)]
comments = []
for divcom in divcoms:
comtxt = unicode(' '.join(divcom.text_content().strip().split()))
if u'| Répondre' in comtxt:
comtxt = comtxt.strip('0123456789').replace(u' | Répondre', '')
author = None
if 'par ' in comtxt:
author = comtxt.split('par ')[-1].split('|')[0]
comtxt = comtxt.replace('par %s' % author, '')
comments.append(Comment(text=comtxt, author=author))
links_author = self.parser.select(self.document.getroot(), 'p.auteur a.couleur_membre') obj_instructions = CleanHTML('//div[@class="recette_etapes"]')
if len(links_author) > 0: obj_picture_url = CleanText('//section[has-class("recette_infos")]/div/img[@class="photo"]/@src')
author = unicode(links_author[0].text.strip()) obj_author = CleanText('//span[@class="author"]', default=NotAvailable)
recipe = Recipe(id, title)
recipe.preparation_time = preparation_time
recipe.cooking_time = cooking_time
recipe.nb_person = nb_person
recipe.ingredients = ingredients
recipe.instructions = instructions
recipe.picture_url = picture_url
recipe.comments = comments
recipe.author = author
recipe.thumbnail_url = NotLoaded
return recipe

View file

@ -18,13 +18,13 @@
# along with weboob. If not, see <http://www.gnu.org/licenses/>. # along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.tools.test import BackendTest from weboob.tools.test import BackendTest
import itertools
class SevenFiftyGramsTest(BackendTest): class SevenFiftyGramsTest(BackendTest):
MODULE = '750g' MODULE = '750g'
def test_recipe(self): def test_recipe(self):
recipes = self.backend.iter_recipes('fondue') recipes = list(itertools.islice(self.backend.iter_recipes('fondue'), 0, 20))
for recipe in recipes: for recipe in recipes:
full_recipe = self.backend.get_recipe(recipe.id) full_recipe = self.backend.get_recipe(recipe.id)
assert full_recipe.instructions assert full_recipe.instructions

View file

@ -18,7 +18,7 @@
# along with weboob. If not, see <http://www.gnu.org/licenses/>. # along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.deprecated.browser import Browser, BrowserHTTPNotFound from weboob.browser import PagesBrowser, URL
from .pages import RecipePage, ResultsPage from .pages import RecipePage, ResultsPage
@ -26,25 +26,17 @@ from .pages import RecipePage, ResultsPage
__all__ = ['MarmitonBrowser'] __all__ = ['MarmitonBrowser']
class MarmitonBrowser(Browser): class MarmitonBrowser(PagesBrowser):
DOMAIN = 'www.marmiton.org' BASEURL = 'http://www.marmiton.org/'
PROTOCOL = 'http' search = URL('recettes/recherche.aspx\?aqt=(?P<pattern>.*)', ResultsPage)
ENCODING = 'utf-8' recipe = URL('recettes/recette_(?P<id>.*).aspx', RecipePage)
USER_AGENT = Browser.USER_AGENTS['wget']
PAGES = {
'http://www.marmiton.org/recettes/recherche.aspx.*': ResultsPage,
'http://www.marmiton.org/recettes/recette_.*': RecipePage,
}
def iter_recipes(self, pattern): def iter_recipes(self, pattern):
self.location('http://www.marmiton.org/recettes/recherche.aspx?st=5&cli=1&aqt=%s' % (pattern)) return self.search.go(pattern=pattern).iter_recipes()
assert self.is_on_page(ResultsPage)
return self.page.iter_recipes()
def get_recipe(self, id): def get_recipe(self, id, recipe=None):
try: recipe = self.recipe.go(id=id).get_recipe(obj=recipe)
self.location('http://www.marmiton.org/recettes/recette_%s.aspx' % id) comments = list(self.page.get_comments())
except BrowserHTTPNotFound: if comments:
return recipe.comments = comments
if self.is_on_page(RecipePage): return recipe
return self.page.get_recipe(id)

View file

@ -44,16 +44,7 @@ class MarmitonModule(Module, CapRecipe):
def fill_recipe(self, recipe, fields): def fill_recipe(self, recipe, fields):
if 'nb_person' in fields or 'instructions' in fields: if 'nb_person' in fields or 'instructions' in fields:
rec = self.get_recipe(recipe.id) recipe = self.browser.get_recipe(recipe.id, recipe)
recipe.picture_url = rec.picture_url
recipe.instructions = rec.instructions
recipe.ingredients = rec.ingredients
recipe.comments = rec.comments
recipe.author = rec.author
recipe.nb_person = rec.nb_person
recipe.cooking_time = rec.cooking_time
recipe.preparation_time = rec.preparation_time
return recipe return recipe
OBJECTS = { OBJECTS = {

View file

@ -17,93 +17,71 @@
# You should have received a copy of the GNU Affero General Public License # You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>. # along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.browser.pages import HTMLPage, pagination
from weboob.browser.elements import ItemElement, ListElement, method
from weboob.browser.filters.standard import Regexp, CleanText, Format, Env, Type
from weboob.browser.filters.html import CleanHTML
from weboob.capabilities.recipe import Recipe, Comment from weboob.capabilities.recipe import Recipe, Comment
from weboob.capabilities.base import NotAvailable, NotLoaded from weboob.capabilities.base import NotAvailable
from weboob.deprecated.browser import Page
class ResultsPage(Page): class ResultsPage(HTMLPage):
""" Page which contains results as a list of recipes """ Page which contains results as a list of recipes
""" """
@pagination
@method
class iter_recipes(ListElement):
item_xpath = '//div[has-class("recette_classique")]'
def iter_recipes(self): def next_page(self):
for div in self.parser.select(self.document.getroot(), 'div.m_search_result'): return CleanText('//a[@id="ctl00_cphMainContent_m_ctrlSearchEngine_m_ctrlSearchListDisplay_m_ctrlSearchPagination_m_linkNextPage"]/@href',
tds = self.parser.select(div, 'td') default=None)(self)
if len(tds) == 2:
title = NotAvailable
thumbnail_url = NotAvailable
short_description = NotAvailable
imgs = self.parser.select(tds[0], 'img')
if len(imgs) > 0:
thumbnail_url = unicode(imgs[0].attrib.get('src', ''))
link = self.parser.select(tds[1], 'div.m_search_titre_recette a', 1)
title = unicode(link.text)
id = link.attrib.get('href', '').replace('.aspx', '').replace('/recettes/recette_', '')
short_description = unicode(' '.join(self.parser.select(tds[
1], 'div.m_search_result_part4', 1).text.strip().split('\n')))
recipe = Recipe(id, title) class item(ItemElement):
recipe.thumbnail_url = thumbnail_url klass = Recipe
recipe.short_description = short_description obj_id = Regexp(CleanText('./div/div[@class="m_titre_resultat"]/a/@href'),
recipe.instructions = NotLoaded '/recettes/recette_(.*).aspx')
recipe.author = NotLoaded obj_title = CleanText('./div/div[@class="m_titre_resultat"]/a')
recipe.ingredients = NotLoaded obj_thumbnail_url = CleanText('./a[@class="m_resultat_lien_image"]', default='')
recipe.nb_person = NotLoaded obj_short_description = Format('%s. %s',
recipe.cooking_time = NotLoaded CleanText('./div/div[@class="m_detail_recette"]'),
recipe.preparation_time = NotLoaded CleanText('./div/div[@class="m_texte_resultat"]'))
yield recipe
class RecipePage(Page): class RecipePage(HTMLPage):
""" Page which contains a recipe """ Page which contains a recipe
""" """
@method
class get_recipe(ItemElement):
klass = Recipe
def get_recipe(self, id): obj_id = Env('id')
title = NotAvailable obj_title = CleanText('//h1[@class="m_title"]')
preparation_time = NotAvailable obj_preparation_time = Type(CleanText('//span[@class="preptime"]'), type=int)
cooking_time = NotAvailable obj_cooking_time = Type(CleanText('//span[@class="cooktime"]'), type=int)
nb_person = NotAvailable
ingredients = NotAvailable
picture_url = NotAvailable
instructions = NotAvailable
comments = NotAvailable
title = unicode(self.parser.select(self.document.getroot(), 'h1.m_title', 1).text_content().strip()) def obj_nb_person(self):
main = self.parser.select(self.document.getroot(), 'div.m_content_recette_main', 1) nb_pers = Regexp(CleanText('//p[@class="m_content_recette_ingredients"]/span[1]'),
preparation_time = int(self.parser.select(main, 'p.m_content_recette_info span.preptime', 1).text_content()) '.*\(pour (\d+) personnes\)', default=0)(self)
cooking_time = int(self.parser.select(main, 'p.m_content_recette_info span.cooktime', 1).text_content()) return [nb_pers] if nb_pers else NotAvailable
ing_header_line = self.parser.select(main, 'p.m_content_recette_ingredients span', 1).text_content()
if '(pour' in ing_header_line and ')' in ing_header_line:
nb_person = [int(ing_header_line.split('pour ')[-1].split('personnes)')[0].split()[0])]
ingredients = self.parser.select(main, 'p.m_content_recette_ingredients', 1).text_content().strip().split('- ')
ingredients = ingredients[1:]
rinstructions = self.parser.select(main, 'div.m_content_recette_todo', 1).text_content().strip()
instructions = u''
for line in rinstructions.split('\n'):
instructions += '%s\n' % line.strip()
instructions = instructions.strip('\n')
imgillu = self.parser.select(self.document.getroot(), 'a.m_content_recette_illu img')
if len(imgillu) > 0:
picture_url = unicode(imgillu[0].attrib.get('src', ''))
divcoms = self.parser.select(self.document.getroot(), 'div.m_commentaire_row') def obj_ingredients(self):
if len(divcoms) > 0: ingredients = CleanText('//p[@class="m_content_recette_ingredients"]', default='')(self).split('-')
comments = [] if len(ingredients) > 1:
for divcom in divcoms: return ingredients[1:]
note = self.parser.select(divcom, 'div.m_commentaire_note span', 1).text.strip()
user = self.parser.select(divcom, 'div.m_commentaire_content span', 1).text.strip()
content = self.parser.select(divcom, 'div.m_commentaire_content p', 1).text.strip()
comments.append(Comment(author=user, rate=note, text=content))
recipe = Recipe(id, title) obj_instructions = CleanHTML('//div[@class="m_content_recette_todo"]')
recipe.preparation_time = preparation_time obj_picture_url = CleanText('//a[@class="m_content_recette_illu"]/@href', default=NotAvailable)
recipe.cooking_time = cooking_time
recipe.nb_person = nb_person @method
recipe.ingredients = ingredients class get_comments(ListElement):
recipe.instructions = instructions item_xpath = '//div[@class="m_commentaire_row"]'
recipe.picture_url = picture_url ignore_duplicate = True
recipe.comments = comments
recipe.thumbnail_url = NotLoaded class item(ItemElement):
recipe.author = NotAvailable klass = Comment
return recipe
obj_author = CleanText('./div[@class="m_commentaire_content"]/span[1]')
obj_rate = CleanText('./div[@class="m_commentaire_note"]/span')
obj_text = CleanText('./div[@class="m_commentaire_content"]/p[1]')
obj_id = CleanText('./div[@class="m_commentaire_content"]/span[1]')

View file

@ -18,13 +18,13 @@
# along with weboob. If not, see <http://www.gnu.org/licenses/>. # along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.tools.test import BackendTest from weboob.tools.test import BackendTest
import itertools
class MarmitonTest(BackendTest): class MarmitonTest(BackendTest):
MODULE = 'marmiton' MODULE = 'marmiton'
def test_recipe(self): def test_recipe(self):
recipes = self.backend.iter_recipes('fondue') recipes = list(itertools.islice(self.backend.iter_recipes('fondue'), 0, 20))
for recipe in recipes: for recipe in recipes:
full_recipe = self.backend.get_recipe(recipe.id) full_recipe = self.backend.get_recipe(recipe.id)
assert full_recipe.instructions assert full_recipe.instructions

View file

@ -63,7 +63,7 @@ class Recipe(BaseObject):
instructions = StringField('Instruction step list of the recipe') instructions = StringField('Instruction step list of the recipe')
comments = Field('User comments about the recipe', list) comments = Field('User comments about the recipe', list)
def __init__(self, id, title): def __init__(self, id='', title=u''):
BaseObject.__init__(self, id) BaseObject.__init__(self, id)
self.title = title self.title = title