diff --git a/modules/cci/pages.py b/modules/cci/pages.py index d3f034f4..0c7547e6 100644 --- a/modules/cci/pages.py +++ b/modules/cci/pages.py @@ -61,7 +61,7 @@ class SearchPage(HTMLPage): klass = BaseJobAdvert obj_url = Format('%s#%s', Env('url'), Env('id')) - obj_description = Join('%s\r\n', + obj_description = Join('\r\n', 'div/fieldset/*[(@class="titreParagraphe" or @class="normal")]', textCleaner=CleanHTML) obj_title = CleanText('div/span[@class="intituleposte"]') diff --git a/modules/monster/pages.py b/modules/monster/pages.py index f562860b..8b0a7265 100644 --- a/modules/monster/pages.py +++ b/modules/monster/pages.py @@ -73,7 +73,7 @@ class AdvertPage(HTMLPage): obj_url = BrowserURL('advert', _id=Env('_id')) obj_title = CleanText('//div[@id="jobcopy"]/h1[@itemprop="title"]|//div[@itemprop="title"]/h1') obj_description = CleanHTML('//div[@id="jobBodyContent"]|//div[@itemprop="description"]') - obj_contract_type = Join('%s ', '//dd[starts-with(@class, "multipledd")]') + obj_contract_type = Join(' ', '//dd[starts-with(@class, "multipledd")]') obj_society_name = CleanText('//dd[@itemprop="hiringOrganization"]') obj_place = CleanText('//span[@itemprop="jobLocation"]') obj_pay = CleanText('//span[@itemprop="baseSalary"]') diff --git a/modules/regionsjob/pages.py b/modules/regionsjob/pages.py index 4e5fa9e8..4b76ed3f 100644 --- a/modules/regionsjob/pages.py +++ b/modules/regionsjob/pages.py @@ -64,7 +64,7 @@ class AdvertPage(HTMLPage): class get_job_advert(ItemElement): klass = BaseJobAdvert - obj_description = Join('\n%s', '//div[@id="annonce-detail"]/p[@class="text"]', textCleaner=CleanHTML) + obj_description = Join('\n', '//div[@id="annonce-detail"]/p[@class="text"]', textCleaner=CleanHTML) obj_id = Env('_id') obj_url = BrowserURL('advert_page', _id=Env('_id')) obj_publication_date = Date(Regexp(CleanText('//div[@id="annonce-detail"]/p[@class="infos"]'), diff --git a/modules/senscritique/pages.py b/modules/senscritique/pages.py index d0e3caa1..d834e760 100644 --- a/modules/senscritique/pages.py +++ b/modules/senscritique/pages.py @@ -125,8 +125,8 @@ class Description(Filter): return Format(u'%s %s\n\n%s%s\n\n', CleanText("%s/div[@class='d-rubric-inner']/h1" % header), CleanText("%s/div[@class='d-rubric-inner']/small" % header), - Join(u'- %s\n', "%s/ul[@class='pvi-product-specs']/li" % header), - Join(u'- %s\n', "%s/ul/li" % section))(el[0]) + Join(u'- ', "%s/ul[@class='pvi-product-specs']/li" % header, newline=True), + Join(u'- ', "%s/ul/li" % section, newline=True, addBefore=' - '))(el[0]) class EventPage(HTMLPage): diff --git a/weboob/browser/filters/standard.py b/weboob/browser/filters/standard.py index b597d5fc..38505b77 100644 --- a/weboob/browser/filters/standard.py +++ b/weboob/browser/filters/standard.py @@ -690,19 +690,31 @@ class BrowserURL(MultiFilter): class Join(Filter): - def __init__(self, pattern, selector=None, textCleaner=CleanText): + def __init__(self, pattern, selector=None, textCleaner=CleanText, newline=False, addBefore='', addAfter=''): super(Join, self).__init__(selector) self.pattern = pattern self.textCleaner = textCleaner + self.newline = newline + self.addBefore = addBefore + self.addAfter = addAfter @debug() def filter(self, el): - res = u'' - for li in el: - res += self.pattern % self.textCleaner.clean(li) + items = [self.textCleaner.clean(e) for e in el] + items = [item for item in items if item] - return res + if self.newline: + items = ['%s\r\n' % item for item in items] + result = self.pattern.join(items) + + if self.addBefore: + result = '%s%s' % (self.addBefore, result) + + if self.addAfter: + result = '%s%s' % (result, self.addAfter) + + return result class Eval(MultiFilter): """