#DeepLearning #SupervisedLearning #CNN
By Billy Gustave
Goal: crawl Trustpilot in four stages (categories, subcategories, company listings, then individual reviews), writing each stage to its own CSV feed for later analysis.
import logging
import pandas as pd
import scrapy
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings
from twisted.internet import reactor, defer
class CategoriesSpider(scrapy.Spider):
    """Stage 1: collect every category link from the Trustpilot categories page."""
    name = 'categories'
    start_urls = ['https://www.trustpilot.com/categories']
    recorded_urls = []  # URLs already yielded, to skip duplicates within the run
    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
        'ROBOTSTXT_OBEY': False,
        'FEED_FORMAT': 'csv',
        'FEED_URI': 'categories.csv'
    }

    def parse(self, response):
        for category in response.xpath('//div[@class="subCategory___BRUDy"]/h3/a'):
            url = category.xpath('./@href').extract_first()
            if url and url not in self.recorded_urls:
                self.recorded_urls.append(url)
                categoryText = category.xpath('./span/text()').extract_first()
                filename = url.strip().split('/categories/')[1]
                yield {
                    'Categories': categoryText.strip() if categoryText else filename,
                    'File_Names': filename,
                    'Subcategories': None,
                    'Urls': 'https://www.trustpilot.com' + url.strip()
                }
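Once this first spider has run, a quick sanity check on its feed confirms the selectors still match the live page (a sketch; assumes categories.csv is in the working directory):

import pandas as pd

cats = pd.read_csv('categories.csv')
print(cats.shape)              # how many category rows were scraped
print(cats['Urls'].is_unique)  # recorded_urls should have kept these unique
cats.head()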
class SubcategoriesSpider(scrapy.Spider):
    """Stage 2: visit each category page and collect its subcategory links."""
    name = 'subcategories'
    recorded_urls = []
    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
        'ROBOTSTXT_OBEY': False,
        'FEED_FORMAT': 'csv',
        'FEED_URI': 'subcategories.csv'
    }

    def start_requests(self):
        # seed the crawl from the URLs written by CategoriesSpider
        df = pd.read_csv('categories.csv').drop_duplicates(subset='Urls', keep='first')
        for url in df['Urls']:
            yield scrapy.Request(str(url))

    def parse(self, response):
        category = response.xpath('//div[@class="listCurrentMidLevel___2Tpg7"]/label/text()').extract_first()
        for subcategory in response.xpath('//ul[@class="subCategoriesList___1O-nM"]/li/a'):
            # guard against a missing href before calling strip()
            url = (subcategory.xpath('./@href').extract_first() or '').strip()
            if url and url != '#' and url not in self.recorded_urls:
                self.recorded_urls.append(url)
                subcategoryText = subcategory.xpath('./span/text()').extract_first()
                yield {
                    'Categories': category.strip() if category else None,
                    'Subcategories': subcategoryText.strip() if subcategoryText else None,
                    'Urls': 'https://www.trustpilot.com' + url
                }
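The hashed class suffixes in these XPaths (subCategory___BRUDy, subCategoriesList___1O-nM, and so on) come from the site's CSS build and change whenever Trustpilot redeploys, which is the usual reason a spider like this silently yields nothing. Matching only the stable prefix with contains() is more tolerant; a self-contained sketch against a stub of the markup:

from scrapy import Selector

# hedged sketch: contains() survives the hashed suffix changing between deploys
html = '<ul class="subCategoriesList___zZz99"><li><a href="/categories/pets"><span>Pets</span></a></li></ul>'
sel = Selector(text=html)
print(sel.xpath('//ul[contains(@class, "subCategoriesList")]/li/a/@href').extract_first())
# -> /categories/pets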
class CompaniesSpider(scrapy.Spider):
    """Stage 3: walk every (sub)category listing and collect the company cards."""
    name = 'companies'  # spider names must be unique within the project
    recorded_urls = []
    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
        'ROBOTSTXT_OBEY': False,
        'FEED_FORMAT': 'csv',
        'FEED_URI': 'companies.csv'
    }

    def start_requests(self):
        # crawl both category and subcategory pages, deduplicated by URL
        cat_df = pd.read_csv('categories.csv').drop_duplicates(subset='Urls', keep='first')
        subcat_df = pd.read_csv('subcategories.csv').drop_duplicates(subset='Urls', keep='first')
        all_df = pd.concat([cat_df.drop('File_Names', axis=1), subcat_df]).drop_duplicates(subset='Urls', keep='first')
        for url in all_df['Urls']:
            yield scrapy.Request(str(url))

    def parse(self, response):
        companies_section = response.xpath(
            '//div[contains(@class,"businessUnitCardsContainer___1iAt9") or contains(@class,"businessUnitCardsContainer___1Ez9Z")]')
        for company in companies_section.xpath('.//a[@class="wrapper___2rOTx"]'):
            url = company.xpath('./@href').extract_first()
            if url and url not in self.recorded_urls:
                self.recorded_urls.append(url)
                companyText = company.xpath('.//div[@class="businessTitle___152-c"]/text()').extract_first()
                # first text node of the rating block holds the review count, the last one the numeric score
                review = company.xpath('.//div[@class="textRating___3F1NO"]/text()').extract_first()
                rating = company.xpath('.//div[@class="textRating___3F1NO"]/text()').extract()
                logo = company.xpath('.//div[@class="logoBox___2MR-l"]/img/@src').extract_first()
                yield {
                    'Company': companyText.strip() if companyText else url.strip().split('/review/')[-1],
                    # second token is assumed to be the count (e.g. "Reviews 1,234"); strip thousands separators before int()
                    'Reviews': int(review.strip().split()[1].replace(',', '')) if review else None,
                    'Rating': float(rating[-1].strip().split()[0]) if rating else None,
                    'Logo': logo.strip() if logo else '//cdn.trustpilot.net/consumer-site/placeholder_logo.svg',
                    'Urls': 'https://www.trustpilot.com' + url.strip()
                }
        next_page = response.xpath('//a[@data-pagination-button-next-paginationlink="true"]/@href').extract_first()
        if next_page is not None:
            next_page = response.urljoin(next_page)
            yield scrapy.Request(next_page, callback=self.parse)
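The review-count and rating fields above rely on fixed token positions in the card text, which breaks if the wording shifts. A regex fallback that just pulls the first number is more forgiving (a sketch; the sample strings are illustrative, not scraped):

import re

def parse_count(text):
    """Return the first integer in text, tolerating thousands separators."""
    if not text:
        return None
    match = re.search(r'\d[\d,]*', text)
    return int(match.group().replace(',', '')) if match else None

print(parse_count('Reviews 1,234'))  # -> 1234
print(parse_count(None))             # -> None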
class ReviewsSpider(scrapy.Spider):
    """Stage 4: visit each company's review pages and collect individual reviews."""
    name = 'reviews'  # spider names must be unique within the project
    max_page_limit = 50  # follow at most this many review pages per company
    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
        'ROBOTSTXT_OBEY': False,
        'FEED_FORMAT': 'csv',
        'FEED_URI': 'companies_reviews.csv',
        'CONCURRENT_REQUESTS': 32
    }

    def start_requests(self):
        df = pd.read_csv('companies.csv').drop_duplicates(subset='Urls', keep='first')
        for url in df['Urls']:
            yield scrapy.Request(str(url))

    def parse(self, response):
        logo = response.xpath('//img[@class="business-unit-profile-summary__image"]/@src').extract_first()
        name = response.xpath('//span[@class="multi-size-header__big"]/text()').extract_first()
        website = response.xpath('//div[@class="badge-card"]/a/@href').extract_first()
        # guard against a missing link, then drop the tracking query string
        website = website.strip().split('?')[0] if website else None
        for reviews_content in response.xpath('//div[@class="review-content"]'):
            comment_text = reviews_content.xpath('.//p[@class="review-content__text"]/text()').extract_first()
            comment_title = reviews_content.xpath('.//h2[@class="review-content__title"]/a/text()').extract_first()
            rating = reviews_content.xpath('.//div[@class="star-rating star-rating--medium"]/img/@alt').extract_first()
            yield {
                'Company_Name': name.strip() if name else website,
                'Company_Site': website,
                'Company_Logo': logo.strip() if logo else '//cdn.trustpilot.net/consumer-site/placeholder_logo.svg',
                # fall back to the review title when the body is empty
                'Comment': comment_text.strip() if comment_text else comment_title,
                # alt text like "5 stars: Excellent" -> 5; -1 marks a missing rating
                'Rating': int(rating.split('star')[0]) if rating else -1
            }
        next_page = response.xpath('//a[@class="button button--primary next-page"]/@href').extract_first()
        # stop once the next link would point past the per-company page cap
        if next_page is not None and 'page=' + str(self.max_page_limit + 1) not in next_page:
            next_page = response.urljoin(next_page)
            yield scrapy.Request(next_page, callback=self.parse)
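Two fragile points in this spider are worth flagging: the rating is parsed out of the star image's alt text, and pagination is capped by string-matching the next page number in the URL. The alt-text parsing can be checked in isolation (the sample string is illustrative of the format the XPath targets):

alt = '5 stars: Excellent'        # illustrative alt text from the star-rating image
print(int(alt.split('star')[0]))  # int() tolerates the trailing space -> 5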
%%time
# Run the spiders through a CrawlerRunner so several crawls can share one
# Twisted reactor. Uncomment the stages you still need; each yield waits for
# the previous crawl to finish before the next one starts.
@defer.inlineCallbacks
def crawl():
    #yield runner.crawl(CategoriesSpider)
    #yield runner.crawl(SubcategoriesSpider)
    #yield runner.crawl(CompaniesSpider)
    yield runner.crawl(ReviewsSpider)
    reactor.stop()

configure_logging()
config = get_project_settings()
runner = CrawlerRunner(settings=config)
crawl()
reactor.run()  # blocks until reactor.stop() is called above
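CrawlerRunner plus a hand-managed reactor is what lets the notebook chain several crawls in one process; note the reactor cannot be restarted, so rerunning this cell requires a kernel restart. In a standalone script the simpler CrawlerProcess does the reactor bookkeeping itself (an equivalent sketch; spiders queued this way run concurrently rather than in sequence):

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(settings=get_project_settings())
process.crawl(ReviewsSpider)  # queue as many spiders as needed
process.start()               # starts the reactor and blocks until all crawls finish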
df = pd.read_csv('companies_reviews.csv')
df.head()
df.info()
import matplotlib.pyplot as plt
df.Rating.hist()  # distribution of scraped star ratings (-1 marks a missing rating)
df.Rating.value_counts()
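The spiders deliberately emit placeholders (Comment may be empty, Rating is -1 when no stars were found), so a light filter is worth running before any downstream use; a sketch using the column names from the feed above:

clean = df.dropna(subset=['Comment'])  # drop reviews with no recoverable text
clean = clean[clean.Rating > 0]        # drop the -1 missing-rating marker
clean.Rating.value_counts()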