[allrecipes] adapt to new version of website / bump to browser2

This commit is contained in:
Bezleputh 2015-09-16 13:23:51 +02:00
commit 159311bf8e
4 changed files with 103 additions and 153 deletions

View file

@ -16,36 +16,24 @@
# #
# You should have received a copy of the GNU Affero General Public License # You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>. # along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.browser import PagesBrowser, URL
from .pages import ResultsPage, RecipePage
from weboob.deprecated.browser import Browser, BrowserHTTPNotFound
from .pages import RecipePage, ResultsPage, FourOFourPage
__all__ = ['AllrecipesBrowser'] __all__ = ['AllrecipesBrowser']
class AllrecipesBrowser(Browser): class AllrecipesBrowser(PagesBrowser):
DOMAIN = 'allrecipes.com' BASEURL = 'http://allrecipes.com'
PROTOCOL = 'http' results = URL('search/results/\?wt=(?P<pattern>.*)\&sort=re',
ENCODING = 'utf-8' 'recipes/.*', ResultsPage)
USER_AGENT = Browser.USER_AGENTS['wget'] recipe = URL('recipe/(?P<_id>.*)/', RecipePage)
PAGES = {
'http://allrecipes.com/search/default.aspx\?qt=k&wt=.*&rt=r&origin=.*': ResultsPage,
'http://allrecipes.com/Recipe/.*/Detail.aspx': RecipePage,
'http://allrecipes.com/404.aspx.*': FourOFourPage
}
def iter_recipes(self, pattern): def iter_recipes(self, pattern):
self.location('http://allrecipes.com/search/default.aspx?qt=k&wt=%s&rt=r&origin=Home%%20Page' % (pattern)) return self.results.go(pattern=pattern).iter_recipes()
assert self.is_on_page(ResultsPage)
return self.page.iter_recipes()
def get_recipe(self, id): def get_recipe(self, _id, obj=None):
try: recipe = self.recipe.go(_id=_id).get_recipe(obj=obj)
self.location('http://allrecipes.com/Recipe/%s/Detail.aspx' % id) comments = list(self.page.get_comments())
except BrowserHTTPNotFound: if comments:
return recipe.comments = comments
if self.is_on_page(RecipePage): return recipe
return self.page.get_recipe(id)

View file

@ -43,19 +43,8 @@ class AllrecipesModule(Module, CapRecipe):
return self.browser.iter_recipes(quote_plus(pattern.encode('utf-8'))) return self.browser.iter_recipes(quote_plus(pattern.encode('utf-8')))
def fill_recipe(self, recipe, fields): def fill_recipe(self, recipe, fields):
if 'nb_person' in fields or 'instructions' in fields: if 'nb_person' in fields or 'instructions' in fields or 'thumbnail_url' in fields:
rec = self.get_recipe(recipe.id) recipe = self.browser.get_recipe(recipe.id, recipe)
recipe.picture_url = rec.picture_url
recipe.instructions = rec.instructions
recipe.ingredients = rec.ingredients
recipe.comments = rec.comments
recipe.author = rec.author
recipe.nb_person = rec.nb_person
recipe.cooking_time = rec.cooking_time
recipe.preparation_time = rec.preparation_time
return recipe return recipe
OBJECTS = { OBJECTS = {Recipe: fill_recipe}
Recipe: fill_recipe,
}

View file

@ -18,109 +18,80 @@
# along with weboob. If not, see <http://www.gnu.org/licenses/>. # along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.capabilities.recipe import Recipe from weboob.browser.pages import HTMLPage, pagination
from weboob.capabilities.base import NotAvailable, NotLoaded from weboob.browser.elements import ItemElement, ListElement, method
from weboob.deprecated.browser import Page from weboob.capabilities.recipe import Recipe, Comment
from weboob.capabilities.base import NotAvailable
from weboob.browser.filters.standard import Regexp, CleanText, Env, Duration
from weboob.browser.filters.html import CleanHTML
import re
class FourOFourPage(Page): class CookingDuration(Duration):
pass _regexp = re.compile(r'PT((?P<hh>\d+)H)?((?P<mm>\d+)M)?((?P<ss>\d+)S)?')
class ResultsPage(Page): class ResultsPage(HTMLPage):
""" Page which contains results as a list of recipies @pagination
""" @method
class iter_recipes(ListElement):
item_xpath = '//article[@class="grid-col--fixed-tiles"]'
def iter_recipes(self): def next_page(self):
for div in self.parser.select(self.document.getroot(), 'div.recipe-info'): return CleanText('//button[@id="btnMoreResults"]/@href')(self)
thumbnail_url = NotAvailable
short_description = NotAvailable
imgs = self.parser.select(div.getparent(), 'img')
if len(imgs) > 0:
url = unicode(imgs[0].attrib.get('src', ''))
if url.startswith('http://'):
thumbnail_url = url
link = self.parser.select(div, 'a.title', 1) class item(ItemElement):
title = unicode(link.text) klass = Recipe
id = unicode(link.attrib.get('href', '').split('/')[2])
recipe = Recipe(id, title) obj_id = Regexp(CleanText('./a[1]/@href'),
recipe.thumbnail_url = thumbnail_url '/recipe/(.*)/')
recipe.short_description = short_description obj_title = CleanText('./a/h3')
recipe.instructions = NotLoaded obj_short_description = CleanText('./a/div/div[@class="rec-card__description"]')
recipe.ingredients = NotLoaded
recipe.nb_person = NotLoaded
recipe.cooking_time = NotLoaded
recipe.preparation_time = NotLoaded
recipe.author = NotLoaded
yield recipe
class RecipePage(Page): class RecipePage(HTMLPage):
""" Page which contains a recipe @method
""" class get_recipe(ItemElement):
klass = Recipe
def get_recipe(self, id): obj_id = Env('_id')
title = NotAvailable obj_title = CleanText('//h1[@itemprop="name"]')
preparation_time = NotAvailable
cooking_time = NotAvailable
author = NotAvailable
nb_person = NotAvailable
ingredients = NotAvailable
picture_url = NotAvailable
instructions = NotAvailable
comments = NotAvailable
title = unicode(self.parser.select(self.document.getroot(), 'h1#itemTitle', 1).text) def obj_preparation_time(self):
imgillu = self.parser.select(self.document.getroot(), 'img#imgPhoto') dt = CookingDuration(CleanText('//time[@itemprop="prepTime"]/@datetime'))(self)
if len(imgillu) > 0: return int(dt.total_seconds() / 60)
picture_url = unicode(imgillu[0].attrib.get('src', ''))
ingredients = [] def obj_cooking_time(self):
l_ing = self.parser.select(self.document.getroot(), 'li#liIngredient') dt = CookingDuration(CleanText('//time[@itemprop="cookTime"]/@datetime'))(self)
for ing in l_ing: return int(dt.total_seconds() / 60)
ingtxt = unicode(ing.text_content().strip())
if ingtxt != '':
ingredients.append(ingtxt)
instructions = u'' def obj_nb_person(self):
l_divinst = self.parser.select(self.document.getroot(), 'div.directLeft li') nb_pers = CleanText('//meta[@id="metaRecipeServings"]/@content')(self)
num_instr = 1 return [nb_pers] if nb_pers else NotAvailable
for inst in l_divinst:
instructions += '%s: %s\n' % (num_instr, inst.text_content())
num_instr += 1
prepmin = 0 def obj_ingredients(self):
emprep = self.parser.select(self.document.getroot(), 'span#prepHoursSpan em') ingredients = []
if len(emprep) > 0: for el in self.el.xpath('//ul[has-class("checklist")]/li/label/span[@itemprop="ingredients"]'):
prepmin += int(emprep[0].text) * 60 ing = CleanText('.')(el)
emprep = self.parser.select(self.document.getroot(), 'span#prepMinsSpan em') if ing:
if len(emprep) > 0: ingredients.append(ing)
prepmin += int(emprep[0].text) return ingredients
if prepmin != 0:
preparation_time = prepmin
cookmin = 0
emcooktime = self.parser.select(self.document.getroot(), 'span#cookHoursSpan em')
if len(emcooktime) > 0:
cookmin += int(emcooktime[0].text) * 60
emcooktime = self.parser.select(self.document.getroot(), 'span#cookMinsSpan em')
if len(emcooktime) > 0:
cookmin += int(emcooktime[0].text)
if cookmin != 0:
cooking_time = cookmin
l_nbpers = self.parser.select(self.document.getroot(), 'span#lblYield[itemprop=recipeYield]')
if len(l_nbpers) > 0 and 'servings' in l_nbpers[0].text:
nb_person = [int(l_nbpers[0].text.split()[0])]
recipe = Recipe(id, title) obj_instructions = CleanHTML('//ol[@itemprop="recipeInstructions"]')
recipe.preparation_time = preparation_time obj_thumbnail_url = CleanText('//section[has-class("hero-photo")]/span/a/img/@src')
recipe.cooking_time = cooking_time
recipe.nb_person = nb_person obj_picture_url = CleanText('//section[has-class("hero-photo")]/span/a/img/@src')
recipe.ingredients = ingredients
recipe.instructions = instructions @method
recipe.picture_url = picture_url class get_comments(ListElement):
recipe.comments = comments item_xpath = '//div[@itemprop="review"]'
recipe.author = author ignore_duplicate = True
recipe.thumbnail_url = NotLoaded
return recipe class item(ItemElement):
klass = Comment
obj_author = CleanText('./article/a/div/a/ul/li/h4[@itemprop="author"]')
obj_rate = CleanText('./article/div/div[@class="rating-stars"]/@data-ratingstars')
obj_text = CleanText('./p[@itemprop="reviewBody"]')
obj_id = CleanText('./article/a/@href')

View file

@ -19,14 +19,16 @@
from weboob.tools.test import BackendTest from weboob.tools.test import BackendTest
import itertools
class AllrecipesTest(BackendTest): class AllrecipesTest(BackendTest):
MODULE = 'allrecipes' MODULE = 'allrecipes'
def test_recipe(self): def test_recipe(self):
recipes = self.backend.iter_recipes('french fries') recipes = list(itertools.islice(self.backend.iter_recipes('french fries'), 0, 20))
for recipe in recipes: assert len(recipes)
full_recipe = self.backend.get_recipe(recipe.id) full_recipe = self.backend.get_recipe(recipes[0].id)
assert full_recipe.instructions assert full_recipe.instructions
assert full_recipe.ingredients assert full_recipe.ingredients
assert full_recipe.title assert full_recipe.title