""" MICROSOFT FORUM TICKET SCRAPER
Website : answers . microsoft . com
Example :
> python 2 _extract . py - - language de - de - - product windows
"""
import re
import urllib.request
import json
import time
import sys
import os
import os.path
import unidecode
import codecs
import uuid
import argparse
from bs4 import BeautifulSoup
from requests import get
# Run arguments
parser = argparse.ArgumentParser()
parser.add_argument("--language",
                    default="de-de",
                    type=str,
                    help="'en-us' or 'de-de'")
parser.add_argument('--product',
                    default='list',
                    type=str,
                    help="['windows', 'msoffice', 'xbox', 'outlook_com', 'skype', 'surface', 'protect', 'edge', 'ie', 'musicandvideo']")
args = parser.parse_args()
# Set params
lang = args.language
productsel = args.product
# Extract text content
def getText(soup):
    texts = []
    try:
        text = soup.find_all("div", "thread-message-content-body-text thread-full-message")
        for item in text:
            texts.append(item.text)
    except Exception:
        texts = []
    return texts
# Clean text a little bit
def cleanText(text):
    text = text.replace("\r", " ").replace("\n", " ")
    text = ' '.join(text.split())
    return text
# Title
def getTitle(soup):
    title = soup.find_all("h1", "c-heading-3")[0].text
    title = cleanText(title)
    return title
# Check whether the case has been closed
def getDone(soup, text):
    if soup.find_all("div", "answered-icon-desc"):
        a_done = "true"
    elif not soup.find_all("div", "answered-icon-desc") and len(text) > 1:
        a_done = "false"
    else:
        a_done = ""
    return a_done
# Get username of question
def getUsernameQuestion(soup):
    name_question = soup.find_all("a", "c-hyperlink message-user-info-link user-name-show-white-space")[0].text
    return name_question
# Get username of answer (not used)
def getUsernameAnswer(soup):
    name_answer = soup.find_all("a", "c-hyperlink message-user-info-link user-name-show-white-space")[1].text
    return name_answer
# Creation date of question
def getDateQuestion(soup):
    date_question = soup.find_all("span", "asking-text-asked-on-link")[0].text.replace("\nErstellt am ", "").replace("\nCréé le ", "").replace("\nCreado el ", "").replace("\nCreato il ", "").replace("\n", "")
    return date_question
# Creation date of answer
def getDateAnswer(soup):
    date_answer = soup.find_all("span", "asking-text-asked-on-link")[1].text.replace("\nBeantwortet am ", "").replace("\nRépondu le ", "").replace("\nRespondió el ", "").replace("\nRisposta il ", "").replace("\n", "")
    return date_answer
# Get number of same cases
def getSame(soup):
    same = soup.find_all("div", "thread-message-content-footer-message-action-link")[1].text
    same_number = re.findall(r'\d+', same)[0]
    return int(same_number)
# Get helpful score of answer
def getHelp(soup):
    helpful = soup.find_all("p", "c-paragraph-4 message-voting-text vote-message-default")[0].text
    helpful_number = re.findall(r'\d+', helpful)[0]
    return int(helpful_number)
# Get views of post
def getViews(soup):
    views = soup.find_all("span", id="threadQuestionInfoViews")[0].text
    views_number = re.findall(r'\d+', views)[0]
    return int(views_number)
# Get post tags
def getTags(soup, product):
    tags = []
    try:
        tag = soup.find_all("ul", id="threadQuestionInfoAppliesToItems")
        for item in tag:
            subtag = item.find_all("a", "c-hyperlink")
            for subitem in subtag:
                tags.append(subitem.text.replace(',', '_').replace(' ', '_'))
    except Exception:
        tags = []
    return f'{product.capitalize()}, {", ".join(tags)}'
# Put it all together
def scrapeMe(url, product):
    print(f"[URL] - {url}")
    # GET WEBSITE
    try:
        response = get(url)
    except Exception as e:
        print(f"[ERROR] - There is an issue with the respective website -> {e}.")
        return  # nothing to parse if the request failed
    html_soup = BeautifulSoup(response.text, 'html.parser')
    fileid = uuid.uuid4().hex
    # GET TEXT
    text = getText(html_soup)
    q_text = cleanText(text[0])

    # GET META
    q_title = getTitle(html_soup)
    q_user = getUsernameQuestion(html_soup)
    q_date = getDateQuestion(html_soup)
    q_views = getViews(html_soup)
    q_tags = getTags(html_soup, product)
    q_same = getSame(html_soup)
    # PACK Q JSON
    question = {}
    question['title'] = q_title
    question['author'] = q_user
    question['createdAt'] = q_date
    question['text'] = q_text
    question['upvotes'] = q_same

    # CHECK IF DONE
    a_done = getDone(html_soup, text)
    # HANDLE IF NO ANSWER
    if len(text) < 2:
        a_date = ""
        a_text = ""
        a_upvotes = ""
    else:
        a_date = getDateAnswer(html_soup)
        a_text = cleanText(text[1])
        try:
            a_upvotes = getHelp(html_soup)
        except Exception:
            a_upvotes = 0

    # PACK A JSON
    answer = {}
    answer['markedAsAnswer'] = a_done
    answer['createdAt'] = a_date
    answer['text'] = a_text
    answer['upvotes'] = a_upvotes
    # PACK JSON
    data = {'question': question, 'id': fileid, 'views': q_views, 'appliesTo': q_tags, 'url': url, 'language': lang, 'answer': answer}
    content = json.dumps(data, indent=4, separators=(',', ': '), ensure_ascii=False)

    # WRITE TO JSON FILE
    with open(f"output-{lang}.json", "a", encoding='utf-8') as file:
        file.write(content + ",")
    print(f"[SUCCESS] - File {fileid}\n")
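# NOTE: output-{lang}.json is an append-only stream of JSON objects separated by
# trailing commas, not a single valid JSON document; wrap its contents in "[ ... ]"
# (and drop the final comma) before loading it as a JSON array.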
# LOOP THROUGH THE OUTPUT TEXT FILES AND CREATE JSON
## Check mode
if productsel == "list":
    products = ['windows', 'msoffice', 'xbox', 'outlook_com', 'skype', 'surface', 'protect', 'edge', 'musicandvideo', 'msteams', 'microsoftedge']
else:
    products = [productsel]
## Loop through products
for product in products:
    try:
        ### Read File
        docs = codecs.open(f"output-{product}-{lang}.txt", 'r', encoding='utf-8').read()

        ### Prepare Links
        url_temp = re.findall(r'(https?://answers\.microsoft\.com/' + lang + r'/' + product + r'/forum/[^\s]+)', docs)
        url_temp2 = [s.strip('"') for s in url_temp]
        url_list = [x for x in url_temp2 if not x.endswith('LastReply')]

        ### Drop duplicates while keeping order
        url_list = list(dict.fromkeys(url_list))

        failed_url = []
        for i, value in enumerate(url_list, start=1):
            try:
                print(f'[STATUS] - {product}, {i}/{len(url_list)}.')
                scrapeMe(value, product)
            except Exception as e:
                failed_url.append(value)
                print(f'[ERROR] - Failed to extract {value} -> {e}.\n')
                continue
        print(f"[DONE] - List for {product} of failed URLs: {failed_url},\n{len(url_list) - len(failed_url)} successfully extracted.")
        os.rename(f"output-{product}-{lang}.txt", f"_output-{product}-{lang}.txt")
    except FileNotFoundError:
        print(f"[ERROR] - 'output-{product}-{lang}.txt' does not exist.\n")