#DeepLearning #SupervisedLearning #CNN

By Billy Gustave

Data Gathering / Web scraping with Scrapy

Goal

Scrape Trustpilot with Scrapy: first the category and subcategory pages, then the companies listed under each, and finally every company's reviews with their star ratings, to build a labeled review dataset for the supervised (CNN) model.

In [1]:
import scrapy, logging, pandas as pd
from scrapy.crawler import CrawlerRunner
from os import path
from twisted.internet import reactor, defer
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings
In [2]:
class CategoriesSpider(scrapy.Spider):
    name = 'categories'
    start_urls = ['https://www.trustpilot.com/categories']
    recorded_urls = []
    
    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
        'ROBOTSTXT_OBEY': False,
        'FEED_FORMAT': 'csv',
        'FEED_URI': 'categories.csv'
    }
    
    def parse(self, response):
        for category in response.xpath('//div[@class="subCategory___BRUDy"]/h3/a'):
            url = category.xpath('./@href').extract_first()
            if url and url not in self.recorded_urls:
                self.recorded_urls.append(url)
                categoryText = category.xpath('./span/text()').extract_first()
                filename = url.strip().split('/categories/')[1]
                yield {
                    'Categories': categoryText.strip() if categoryText else filename,
                    'File_Names': filename,
                    'Subcategories': None,
                    'Urls': 'https://www.trustpilot.com' + url.strip()
                }
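The XPath selectors above hinge on Trustpilot's hashed CSS-module class names (e.g. subCategory___BRUDy), which change whenever the site ships a new front-end build, so it pays to verify them against a live page before starting a crawl. A minimal sketch using the same requests + TextResponse trick that appears further down in this notebook (the selector here is just the one assumed above):

import requests, scrapy

# Fetch the category index and wrap it in a TextResponse so the XPath can be
# tested outside a running spider. If this prints an empty list, the class
# hash has likely changed and the spider's selectors need updating.
res = requests.get('https://www.trustpilot.com/categories')
response = scrapy.http.TextResponse(res.url, body=res.text, encoding='utf-8')
print(response.xpath('//div[@class="subCategory___BRUDy"]/h3/a/@href').extract()[:5])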
In [3]:
class SubcategoriesSpider(scrapy.Spider):
    name = 'subcategories'
    def start_requests(self):
        df = pd.read_csv(r'categories.csv').drop_duplicates(subset='Urls', keep='first')
        for url in df['Urls']:
            yield scrapy.Request(str(url))
    recorded_urls = []
    
    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
        'ROBOTSTXT_OBEY': False,
        'FEED_FORMAT': 'csv',
        'FEED_URI': 'subcategories.csv'
    }
    
    def parse(self, response):
        category = response.xpath('//div[@class="listCurrentMidLevel___2Tpg7"]/label/text()').extract_first()
        for subcategory in response.xpath('//ul[@class="subCategoriesList___1O-nM"]/li/a'):
            url = subcategory.xpath('./@href').extract_first().strip()
            if url and url != '#' and url not in self.recorded_urls:
                self.recorded_urls.append(url)
                subcategoryText = subcategory.xpath('./span/text()').extract_first()
                yield {
                    'Categories': category.strip() if category else None,
                    'Subcategories': subcategoryText.strip() if subcategoryText else None,
                    'Urls':'https://www.trustpilot.com' + url
                }
In [4]:
class CompaniesSpider(scrapy.Spider):
    name = 'companies'
    def start_requests(self):
        cat_df = pd.read_csv(r'categories.csv').drop_duplicates(subset='Urls', keep='first')
        subcat_df = pd.read_csv(r'subcategories.csv').drop_duplicates(subset='Urls', keep='first')
        all_df = pd.concat([cat_df.drop('File_Names', axis=1), subcat_df]).drop_duplicates(subset='Urls', keep='first')
        for url in all_df['Urls']:
            yield scrapy.Request(str(url))
    recorded_urls = []
    
    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
        'ROBOTSTXT_OBEY': False,
        'FEED_FORMAT': 'csv',
        'FEED_URI': 'companies.csv'
    }
    
    def parse(self, response):
        companies_section = response.xpath(
            '//div[contains(@class,"businessUnitCardsContainer___1iAt9") or contains(@class,"businessUnitCardsContainer___1Ez9Z")]')
        for company in companies_section.xpath('.//a[@class="wrapper___2rOTx"]'):
            url = company.xpath('./@href').extract_first()
            if url and url not in self.recorded_urls:
                self.recorded_urls.append(url)
                companyText = company.xpath('.//div[@class="businessTitle___152-c"]/text()').extract_first()
                review = company.xpath('.//div[@class="textRating___3F1NO"]/text()').extract_first()
                rating = company.xpath('.//div[@class="textRating___3F1NO"]/text()').extract()
                logo = company.xpath('.//div[@class="logoBox___2MR-l"]/img/@src').extract_first()
                yield {
                    'Company': companyText.strip() if companyText else url.strip().split('/review/')[-1],
                    'Reviews': int(review.strip().split()[1]) if review else None, 
                    'Rating': float(rating[-1].strip().split()[0]) if rating else None,
                    'Logo': logo.strip() if logo else '//cdn.trustpilot.net/consumer-site/placeholder_logo.svg',
                    'Urls': 'https://www.trustpilot.com' + url.strip()
                }
        
        next_page = response.xpath('//a[@data-pagination-button-next-paginationlink="true"]/@href').extract_first()
        if next_page is not None:
            next_page = response.urljoin(next_page)
            yield scrapy.Request(next_page, callback=self.parse)
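A company can be listed under several categories and subcategories, so recorded_urls keeps the spider from yielding the same profile twice, and the next-page link is followed until the listing runs out. Before kicking off the much longer review crawl, a quick sanity check on the feed doesn't hurt; a minimal sketch, assuming companies.csv has been written by the spider above:

import pandas as pd

# Each company profile URL should appear exactly once in the feed.
companies = pd.read_csv('companies.csv')
print(len(companies), 'companies scraped,', companies['Urls'].duplicated().sum(), 'duplicate URLs')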
In [5]:
class ReviewsSpider(scrapy.Spider):
    name = 'reviews'
    max_page_limit = 50  # stop following pagination after this many review pages per company
    def start_requests(self):
        df = pd.read_csv(r'companies.csv').drop_duplicates(subset='Urls', keep='first')
        for url in df['Urls']:
            yield scrapy.Request(str(url))
    
    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
        'ROBOTSTXT_OBEY': False,
        'FEED_FORMAT': 'csv',
        'FEED_URI': 'companies_reviews.csv',
        'CONCURRENT_REQUESTS': 32
    }
    
    def parse(self, response):
        logo = response.xpath('//img[@class="business-unit-profile-summary__image"]/@src').extract_first()
        name = response.xpath('//span[@class="multi-size-header__big"]/text()').extract_first()
        website = response.xpath('//div[@class="badge-card"]/a/@href').extract_first()
        website = website.split('?')[0] if website else None  # some profiles have no website badge
        for reviews_content in response.xpath('//div[@class="review-content"]'):
            comment_text = reviews_content.xpath('.//p[@class="review-content__text"]/text()').extract_first()
            comment_title = reviews_content.xpath('.//h2[@class="review-content__title"]/a/text()').extract_first()
            rating = reviews_content.xpath('.//div[@class="star-rating star-rating--medium"]/img/@alt').extract_first()
            yield {
                'Company_Name': name.strip() if name else website,
                'Company_Site': website.strip() if website else None,
                'Company_Logo': logo.strip() if logo else '//cdn.trustpilot.net/consumer-site/placeholder_logo.svg',
                'Comment': comment_text.strip() if comment_text else comment_title,
                'Rating': int(rating.split('star')[0]) if rating else -1
            }
        
        next_page = response.xpath('//a[@class="button button--primary next-page"]/@href').extract_first()
        if next_page is not None and 'page='+str(self.max_page_limit+1) not in next_page:
            next_page = response.urljoin(next_page)
            yield scrapy.Request(next_page, callback=self.parse)
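The star rating comes from the rating image's alt text, and the crawl stops before the paginated URL would pass max_page_limit. A quick sketch of both string checks; the alt-text format shown is an assumption about how Trustpilot labelled its rating images at the time:

# Hypothetical alt text of the form "4 stars: Great" -> rating 4
alt = '4 stars: Great'
print(int(alt.split('star')[0]))  # 4 (int() tolerates the trailing space)

# Pagination guard: with max_page_limit = 50, a link to page 51 is not followed
max_page_limit = 50
next_page = '/review/example.com?page=51'  # hypothetical next-page href
print('page=' + str(max_page_limit + 1) in next_page)  # True -> stop here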
In [6]:
%%time
@defer.inlineCallbacks
def crawl():
    # Spiders are chained so they run one after another on the same reactor;
    # the first three feeds were produced in earlier runs, so only the review
    # crawl is active here.
    #yield runner.crawl(CategoriesSpider)
    #yield runner.crawl(SubcategoriesSpider)
    #yield runner.crawl(CompaniesSpider)
    yield runner.crawl(ReviewsSpider)
    reactor.stop()


configure_logging()
config = get_project_settings()
runner = CrawlerRunner(settings=config)
crawl()
reactor.run()
2020-03-14 18:54:18 [scrapy.crawler] INFO: Overridden settings: {'CONCURRENT_REQUESTS': 32, 'FEED_FORMAT': 'csv', 'FEED_URI': 'companies_reviews.csv', 'LOG_LEVEL': 30}
Wall time: 2h 28min 22s
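One quirk of driving Scrapy from a notebook this way: Twisted's reactor can only be started once per process, so re-running the cell above requires a kernel restart. For a one-off run of a single spider, CrawlerProcess is a simpler alternative since it manages the reactor itself; a minimal sketch (it has the same once-per-process limitation):

from scrapy.crawler import CrawlerProcess

# Runs one spider and blocks until it finishes; the reactor is started and
# stopped internally, but still cannot be restarted in the same process.
process = CrawlerProcess(settings={'LOG_LEVEL': 'WARNING'})
process.crawl(CategoriesSpider)
process.start()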
In [7]:
df = pd.read_csv(r'companies_reviews.csv')
df.head()
Out[7]:
Company_Name Company_Site Company_Logo Comment Rating
0 Fragrancebuy Canada http://fragrancebuy.ca //s3-eu-west-1.amazonaws.com/tpd/logos/597befb... Excellent service 5
1 Fragrancebuy Canada http://fragrancebuy.ca //s3-eu-west-1.amazonaws.com/tpd/logos/597befb... Bought perfume as i was given a gift card , ho... 1
2 Fragrancebuy Canada http://fragrancebuy.ca //s3-eu-west-1.amazonaws.com/tpd/logos/597befb... A new Fragrance Buy customer. I recently purc... 5
3 Fragrancebuy Canada http://fragrancebuy.ca //s3-eu-west-1.amazonaws.com/tpd/logos/597befb... Excellent fast delivery to the USA. Packaging ... 5
4 Fragrancebuy Canada http://fragrancebuy.ca //s3-eu-west-1.amazonaws.com/tpd/logos/597befb... Excellence experience!! First time buyer from ... 5
In [8]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2865324 entries, 0 to 2865323
Data columns (total 5 columns):
 #   Column        Dtype 
---  ------        ----- 
 0   Company_Name  object
 1   Company_Site  object
 2   Company_Logo  object
 3   Comment       object
 4   Rating        int64 
dtypes: int64(1), object(4)
memory usage: 109.3+ MB
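ReviewsSpider falls back to the review title when the body text is missing, and yields None when both are absent, so the Comment column can contain gaps. A quick check worth running before the text is fed to a model; just a sketch on the dataframe loaded above:

# Count reviews with no usable text and drop them if needed.
print(df['Comment'].isna().sum(), 'rows without comment text')
df_clean = df.dropna(subset=['Comment'])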
# Scratch: fetch a single review page and wrap it in a TextResponse for XPath testing
import requests
res = requests.get('https://www.trustpilot.com/review/zestmoney.in')
response = scrapy.http.TextResponse(res.url, body=res.text, encoding='utf-8')
In [9]:
import matplotlib.pyplot as plt
df.Rating.hist()
Out[9]:
<matplotlib.axes._subplots.AxesSubplot at 0x15a9d77c948>
In [10]:
df.Rating.value_counts()
Out[10]:
5    2174278
1     271560
4     255292
3      98008
2      66186
Name: Rating, dtype: int64
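The counts show the dataset is heavily skewed toward 5-star reviews, which is worth keeping in mind for the supervised model later; normalized proportions make the imbalance easier to read:

# Share of each rating value; 5-star reviews dominate the dataset.
print(df.Rating.value_counts(normalize=True).round(3))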

Contact Me

www.linkedin.com/in/billygustave

billygustave.com