#DeepLearning #SupervisedLearning #CNN

By Billy Gustave

Data Gathering / Web scraping with Scrapy

Goal

Scrape Trustpilot with Scrapy: first the category and subcategory pages, then the companies listed under each, and finally every company's reviews with their star ratings, to build a labeled review dataset for the supervised (CNN) model.

In [1]:
import scrapy, logging, pandas as pd
from scrapy.crawler import CrawlerRunner
from os import path
from twisted.internet import reactor, defer
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings
In [2]:
class CategoriesSpider(scrapy.Spider):
    name = 'categories'
    start_urls = ['https://www.trustpilot.com/categories']
    recorded_urls = []
    
    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
        'ROBOTSTXT_OBEY': False,
        'FEED_FORMAT': 'csv',
        'FEED_URI': 'categories.csv'
    }
    
    def parse(self, response):
        for category in response.xpath('//div[@class="subCategory___BRUDy"]/h3/a'):
            url = category.xpath('./@href').extract_first()
            if url and url not in self.recorded_urls:
                self.recorded_urls.append(url)
                categoryText = category.xpath('./span/text()').extract_first()
                filename = url.strip().split('/categories/')[1]
                yield {
                    'Categories': categoryText.strip() if categoryText else filename,
                    'File_Names': filename,
                    'Subcategories': None,
                    'Urls': 'https://www.trustpilot.com' + url.strip()
                }
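The XPath selectors above hinge on Trustpilot's hashed CSS-module class names (e.g. subCategory___BRUDy), which change whenever the site ships a new front-end build, so it pays to verify them against a live page before starting a crawl. A minimal sketch using the same requests + TextResponse trick that appears further down in this notebook (the selector here is just the one assumed above):

import requests, scrapy

# Fetch the category index and wrap it in a TextResponse so the XPath can be
# tested outside a running spider. If this prints an empty list, the class
# hash has likely changed and the spider's selectors need updating.
res = requests.get('https://www.trustpilot.com/categories')
response = scrapy.http.TextResponse(res.url, body=res.text, encoding='utf-8')
print(response.xpath('//div[@class="subCategory___BRUDy"]/h3/a/@href').extract()[:5])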
In [3]:
class SubcategoriesSpider(scrapy.Spider):
    name = 'subcategories'
    def start_requests(self):
        df = pd.read_csv(r'categories.csv').drop_duplicates(subset='Urls', keep='first')
        for url in df['Urls']:
            yield scrapy.Request(str(url))
    recorded_urls = []
    
    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
        'ROBOTSTXT_OBEY': False,
        'FEED_FORMAT': 'csv',
        'FEED_URI': 'subcategories.csv'
    }
    
    def parse(self, response):
        category = response.xpath('//div[@class="listCurrentMidLevel___2Tpg7"]/label/text()').extract_first()
        for subcategory in response.xpath('//ul[@class="subCategoriesList___1O-nM"]/li/a'):
            url = subcategory.xpath('./@href').extract_first().strip()
            if url and url != '#' and url not in self.recorded_urls:
                self.recorded_urls.append(url)
                subcategoryText = subcategory.xpath('./span/text()').extract_first()
                yield {
                    'Categories': category.strip() if category else None,
                    'Subcategories': subcategoryText.strip() if subcategoryText else None,
                    'Urls':'https://www.trustpilot.com' + url
                }
In [4]:
class CompaniesSpider(scrapy.Spider):
    name = 'companies'
    def start_requests(self):
        cat_df = pd.read_csv(r'categories.csv').drop_duplicates(subset='Urls', keep='first')
        subcat_df = pd.read_csv(r'subcategories.csv').drop_duplicates(subset='Urls', keep='first')
        all_df = pd.concat([cat_df.drop('File_Names', axis=1), subcat_df]).drop_duplicates(subset='Urls', keep='first')
        for url in all_df['Urls']:
            yield scrapy.Request(str(url))
    recorded_urls = []
    
    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
        'ROBOTSTXT_OBEY': False,
        'FEED_FORMAT': 'csv',
        'FEED_URI': 'companies.csv'
    }
    
    def parse(self, response):
        companies_section = response.xpath(
            '//div[contains(@class,"businessUnitCardsContainer___1iAt9") or contains(@class,"businessUnitCardsContainer___1Ez9Z")]')
        for company in companies_section.xpath('.//a[@class="wrapper___2rOTx"]'):
            url = company.xpath('./@href').extract_first()
            if url and url not in self.recorded_urls:
                self.recorded_urls.append(url)
                companyText = company.xpath('.//div[@class="businessTitle___152-c"]/text()').extract_first()
                review = company.xpath('.//div[@class="textRating___3F1NO"]/text()').extract_first()
                rating = company.xpath('.//div[@class="textRating___3F1NO"]/text()').extract()
                logo = company.xpath('.//div[@class="logoBox___2MR-l"]/img/@src').extract_first()
                yield {
                    'Company': companyText.strip() if companyText else url.strip().split('/review/')[-1],
                    'Reviews': int(review.strip().split()[1]) if review else None, 
                    'Rating': float(rating[-1].strip().split()[0]) if rating else None,
                    'Logo': logo.strip() if logo else '//cdn.trustpilot.net/consumer-site/placeholder_logo.svg',
                    'Urls': 'https://www.trustpilot.com' + url.strip()
                }
        
        next_page = response.xpath('//a[@data-pagination-button-next-paginationlink="true"]/@href').extract_first()
        if next_page is not None:
            next_page = response.urljoin(next_page)
            yield scrapy.Request(next_page, callback=self.parse)
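A company can be listed under several categories and subcategories, so recorded_urls keeps the spider from yielding the same profile twice, and the next-page link is followed until the listing runs out. Before kicking off the much longer review crawl, a quick sanity check on the feed doesn't hurt; a minimal sketch, assuming companies.csv has been written by the spider above:

import pandas as pd

# Each company profile URL should appear exactly once in the feed.
companies = pd.read_csv('companies.csv')
print(len(companies), 'companies scraped,', companies['Urls'].duplicated().sum(), 'duplicate URLs')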
In [5]:
class ReviewsSpider(scrapy.Spider):
    name = 'reviews'
    max_page_limit = 50  # stop following pagination after this many review pages per company
    def start_requests(self):
        df = pd.read_csv(r'companies.csv').drop_duplicates(subset='Urls', keep='first')
        for url in df['Urls']:
            yield scrapy.Request(str(url))
    
    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
        'ROBOTSTXT_OBEY': False,
        'FEED_FORMAT': 'csv',
        'FEED_URI': 'companies_reviews.csv',
        'CONCURRENT_REQUESTS': 32
    }
    
    def parse(self, response):
        logo = response.xpath('//img[@class="business-unit-profile-summary__image"]/@src').extract_first()
        name = response.xpath('//span[@class="multi-size-header__big"]/text()').extract_first()
        website = response.xpath('//div[@class="badge-card"]/a/@href').extract_first()
        website = website.split('?')[0] if website else None  # some profiles have no website badge
        for reviews_content in response.xpath('//div[@class="review-content"]'):
            comment_text = reviews_content.xpath('.//p[@class="review-content__text"]/text()').extract_first()
            comment_title = reviews_content.xpath('.//h2[@class="review-content__title"]/a/text()').extract_first()
            rating = reviews_content.xpath('.//div[@class="star-rating star-rating--medium"]/img/@alt').extract_first()
            yield {
                'Company_Name': name.strip() if name else website,
                'Company_Site': website.strip() if website else None,
                'Company_Logo': logo.strip() if logo else '//cdn.trustpilot.net/consumer-site/placeholder_logo.svg',
                'Comment': comment_text.strip() if comment_text else comment_title,
                'Rating': int(rating.split('star')[0]) if rating else -1
            }
        
        next_page = response.xpath('//a[@class="button button--primary next-page"]/@href').extract_first()
        if next_page is not None and 'page='+str(self.max_page_limit+1) not in next_page:
            next_page = response.urljoin(next_page)
            yield scrapy.Request(next_page, callback=self.parse)
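The star rating comes from the rating image's alt text, and the crawl stops before the paginated URL would pass max_page_limit. A quick sketch of both string checks; the alt-text format shown is an assumption about how Trustpilot labelled its rating images at the time:

# Hypothetical alt text of the form "4 stars: Great" -> rating 4
alt = '4 stars: Great'
print(int(alt.split('star')[0]))  # 4 (int() tolerates the trailing space)

# Pagination guard: with max_page_limit = 50, a link to page 51 is not followed
max_page_limit = 50
next_page = '/review/example.com?page=51'  # hypothetical next-page href
print('page=' + str(max_page_limit + 1) in next_page)  # True -> stop here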
In [6]:
%%time
@defer.inlineCallbacks
def crawl():
    # Spiders are chained so they run one after another on the same reactor;
    # the first three feeds were produced in earlier runs, so only the review
    # crawl is active here.
    #yield runner.crawl(CategoriesSpider)
    #yield runner.crawl(SubcategoriesSpider)
    #yield runner.crawl(CompaniesSpider)
    yield runner.crawl(ReviewsSpider)
    reactor.stop()


configure_logging()
config = get_project_settings()
runner = CrawlerRunner(settings=config)
crawl()
reactor.run()
2020-03-14 18:54:18 [scrapy.crawler] INFO: Overridden settings: {'CONCURRENT_REQUESTS': 32, 'FEED_FORMAT': 'csv', 'FEED_URI': 'companies_reviews.csv', 'LOG_LEVEL': 30}
Wall time: 2h 28min 22s
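One quirk of driving Scrapy from a notebook this way: Twisted's reactor can only be started once per process, so re-running the cell above requires a kernel restart. For a one-off run of a single spider, CrawlerProcess is a simpler alternative since it manages the reactor itself; a minimal sketch (it has the same once-per-process limitation):

from scrapy.crawler import CrawlerProcess

# Runs one spider and blocks until it finishes; the reactor is started and
# stopped internally, but still cannot be restarted in the same process.
process = CrawlerProcess(settings={'LOG_LEVEL': 'WARNING'})
process.crawl(CategoriesSpider)
process.start()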
In [7]:
df = pd.read_csv(r'companies_reviews.csv')
df.head()
Out[7]:
Company_Name Company_Site Company_Logo Comment Rating
0 Fragrancebuy Canada http://fragrancebuy.ca //s3-eu-west-1.amazonaws.com/tpd/logos/597befb... Excellent service 5
1 Fragrancebuy Canada http://fragrancebuy.ca //s3-eu-west-1.amazonaws.com/tpd/logos/597befb... Bought perfume as i was given a gift card , ho... 1
2 Fragrancebuy Canada http://fragrancebuy.ca //s3-eu-west-1.amazonaws.com/tpd/logos/597befb... A new Fragrance Buy customer. I recently purc... 5
3 Fragrancebuy Canada http://fragrancebuy.ca //s3-eu-west-1.amazonaws.com/tpd/logos/597befb... Excellent fast delivery to the USA. Packaging ... 5
4 Fragrancebuy Canada http://fragrancebuy.ca //s3-eu-west-1.amazonaws.com/tpd/logos/597befb... Excellence experience!! First time buyer from ... 5
In [8]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2865324 entries, 0 to 2865323
Data columns (total 5 columns):
 #   Column        Dtype 
---  ------        ----- 
 0   Company_Name  object
 1   Company_Site  object
 2   Company_Logo  object
 3   Comment       object
 4   Rating        int64 
dtypes: int64(1), object(4)
memory usage: 109.3+ MB
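ReviewsSpider falls back to the review title when the body text is missing, and yields None when both are absent, so the Comment column can contain gaps. A quick check worth running before the text is fed to a model; just a sketch on the dataframe loaded above:

# Count reviews with no usable text and drop them if needed.
print(df['Comment'].isna().sum(), 'rows without comment text')
df_clean = df.dropna(subset=['Comment'])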
# Scratch: fetch a single review page and wrap it in a TextResponse for XPath testing
import requests
res = requests.get('https://www.trustpilot.com/review/zestmoney.in')
response = scrapy.http.TextResponse(res.url, body=res.text, encoding='utf-8')
In [9]:
import matplotlib.pyplot as plt
df.Rating.hist()
Out[9]:
<matplotlib.axes._subplots.AxesSubplot at 0x15a9d77c948>
In [10]:
df.Rating.value_counts()
Out[10]:
5    2174278
1     271560
4     255292
3      98008
2      66186
Name: Rating, dtype: int64
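The counts show the dataset is heavily skewed toward 5-star reviews, which is worth keeping in mind for the supervised model later; normalized proportions make the imbalance easier to read:

# Share of each rating value; 5-star reviews dominate the dataset.
print(df.Rating.value_counts(normalize=True).round(3))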

Contact Me

www.linkedin.com/in/billygustave

billygustave.com