verseagility/scraper/2_extract.py

""" MICROSOFT FORUM TICKET SCRAPER
Website: answers.microsoft.com
Example:
> python 2_extract.py --language de-de --product windows
"""
import re
import json
import os
import codecs
import uuid
import argparse
from bs4 import BeautifulSoup
from requests import get
# Run arguments
parser = argparse.ArgumentParser()
parser.add_argument("--language",
2020-01-24 21:07:28 +03:00
default="de-de",
type=str,
help="'en-us' or 'de-de")
parser.add_argument('--product',
default='list',
2020-01-24 21:07:28 +03:00
type=str,
help="['windows', 'msoffice', 'xbox', 'outlook_com', 'skype', 'surface', 'protect', 'edge','ie','musicandvideo']")
2020-01-24 21:07:28 +03:00
args = parser.parse_args()
# Set params
lang = args.language
productsel = args.product
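# Mode selection: with the default --product 'list' the driver loop at the bottom walks
# every known product; any other value restricts the run to that single product.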
# Extract text content
def getText(soup):
    texts = []
    try:
        text = soup.find_all("div", "thread-message-content-body-text thread-full-message")
        for item in text:
            texts.append(item.text)
    except Exception:
        texts = []
    return texts
# Clean text a little bit
def cleanText(text):
    text = text.replace("\r", "").replace("\n", "")
    text = ' '.join(text.split())
    return text
# Title
def getTitle(soup):
    title = soup.find_all("h1", "c-heading-3")[0].text
    title = cleanText(title)
    return title
# Check whether the case has been closed
def getDone(soup, text):
    if soup.find_all("div", "answered-icon-desc"):
        a_done = "true"
    elif not soup.find_all("div", "answered-icon-desc") and len(text) > 1:
        a_done = "false"
    else:
        a_done = ""
    return a_done
# Get username
def getUsernameQuestion(soup):
    name_question = soup.find_all("a", "c-hyperlink message-user-info-link user-name-show-white-space")[0].text
    return name_question
# Get username of answer (not used)
def getUsernameAnswer(soup):
    name_answer = soup.find_all("a", "c-hyperlink message-user-info-link user-name-show-white-space")[1].text
    return name_answer
# Create date of question
def getDateQuestion(soup):
    date_question = soup.find_all("span", "asking-text-asked-on-link")[0].text.replace("\nErstellt am ", "").replace("\nCréé le ", "").replace("\nCreado el ", "").replace("\nCreato il ", "").replace("\n", "")
    return date_question
# Create date of answer
def getDateAnswer(soup):
    date_answer = soup.find_all("span", "asking-text-asked-on-link")[1].text.replace("\nBeantwortet am ", "").replace("\nRépondu le ", "").replace("\nRespondió el ", "").replace("\nRisposta il ", "").replace("\n", "")
    return date_answer
# Get number of same cases
def getSame(soup):
    same = soup.find_all("div", "thread-message-content-footer-message-action-link")[1].text
    same_number = re.findall(r'\d+', same)[0]
    return int(same_number)
# Get helpful score of answer
def getHelp(soup):
    helpful = soup.find_all("p", "c-paragraph-4 message-voting-text vote-message-default")[0].text
    helpful_number = re.findall(r'\d+', helpful)[0]
    return int(helpful_number)
# Get views of post
def getViews(soup):
    views = soup.find_all("span", id="threadQuestionInfoViews")[0].text
    views_number = re.findall(r'\d+', views)[0]
    return int(views_number)
# Get post tags
def getTags(soup, product):
    tags = []
    try:
        tag = soup.find_all("ul", id="threadQuestionInfoAppliesToItems")
        for item in tag:
            subtag = item.find_all("a", "c-hyperlink")
            for subitem in subtag:
                tags.append(subitem.text.replace(', ', '_').replace(' ', '_'))
    except Exception:
        tags = []
    return f'{product.capitalize()},{",".join(tags)}'
# Put it all together
def scrapeMe(url, product):
    print(f"[URL] - {url}")
    # GET WEBSITE
    try:
        response = get(url)
    except Exception as e:
        print(f"[ERROR] - There is an issue with the respective website -> {e}.")
        return
    html_soup = BeautifulSoup(response.text, 'html.parser')
    fileid = uuid.uuid4().hex
    # GET TEXT
    text = getText(html_soup)
    q_text = cleanText(text[0])
    # GET META
    q_title = getTitle(html_soup)
    q_user = getUsernameQuestion(html_soup)
    q_date = getDateQuestion(html_soup)
    q_views = getViews(html_soup)
    q_tags = getTags(html_soup, product)
    q_same = getSame(html_soup)
    # PACK Q JSON
    question = {}
    question['title'] = q_title
    question['author'] = q_user
    question['createdAt'] = q_date
    question['text'] = q_text
    question['upvotes'] = q_same
    # CHECK IF DONE
    a_done = getDone(html_soup, text)
    # HANDLE IF NO ANSWER
    if len(text) < 2:
        a_date = ""
        a_text = ""
        a_upvotes = ""
    else:
        a_date = getDateAnswer(html_soup)
        a_text = cleanText(text[1])
        try:
            a_upvotes = getHelp(html_soup)
        except Exception:
            a_upvotes = 0
    # PACK A JSON
    answer = {}
    answer['markedAsAnswer'] = a_done
    answer['createdAt'] = a_date
    answer['text'] = a_text
    answer['upvotes'] = a_upvotes
    # PACK JSON
    data = {'question': question, 'id': fileid, 'views': q_views, 'appliesTo': q_tags, 'url': url, 'language': lang, 'answer': answer}
    content = json.dumps(data, indent=4, separators=(',', ': '), ensure_ascii=False)
    # WRITE TO JSON FILE
    with open(f"output-{lang}.json", "a", encoding='utf-8') as file:
        file.write(content + ",")
    print(f"[SUCCESS] - File {fileid}\n")
# LOOP THROUGH THE OUTPUT TEXT FILES AND CREATE JSON
## Check mode
if productsel == "list":
products = ['windows', 'msoffice', 'xbox', 'outlook_com', 'skype', 'surface', 'protect', 'edge', 'musicandvideo', 'msteams', 'microsoftedge']
else:
products = [productsel]
## Loop through product
for product in products:
    try:
        ### Read File
        with codecs.open(f"output-{product}-{lang}.txt", 'r', encoding='utf-8') as infile:
            docs = infile.read()
        ### Prepare Links
        url_temp = re.findall(r'(https?://answers\.microsoft\.com/' + lang + r'/' + product + r'/forum/[^\s]+)', docs)
        url_temp2 = [s.strip('"') for s in url_temp]
        url_list = [x for x in url_temp2 if not x.endswith('LastReply')]
        ### Drop duplicates
        url_list = list(dict.fromkeys(url_list))
        failed_url = []
        for i, value in enumerate(url_list, 1):
            try:
                print(f'[STATUS] - {product}, {i}/{len(url_list)}.')
                scrapeMe(value, product)
            except Exception as e:
                failed_url.append(value)
                print(f'[ERROR] - Failed to extract {value} -> {e}.\n')
        print(f"[DONE] - List for {product} of failed URLs: {failed_url},\n{len(url_list) - len(failed_url)} successfully extracted.")
        ### Mark the input file as processed
        os.rename(f"output-{product}-{lang}.txt", f"_output-{product}-{lang}.txt")
    except FileNotFoundError:
        print(f"[ERROR] - 'output-{product}-{lang}.txt' does not exist.\n")