From c8f5037ab361b1d4d22472d68c419edc38d76466 Mon Sep 17 00:00:00 2001 From: Sreeja Deb <52777362+Sreeja-Deb@users.noreply.github.com> Date: Mon, 27 Sep 2021 12:02:46 +0530 Subject: [PATCH] Basic Featurisation + Similarity in text --- .../TextAnalytics-BasicFeaturisation_Similarity.ipynb | 1 + 1 file changed, 1 insertion(+) create mode 100644 Notebooks/Text Analytics/TextAnalytics-BasicFeaturisation_Similarity.ipynb diff --git a/Notebooks/Text Analytics/TextAnalytics-BasicFeaturisation_Similarity.ipynb b/Notebooks/Text Analytics/TextAnalytics-BasicFeaturisation_Similarity.ipynb new file mode 100644 index 0000000..0c402ff --- /dev/null +++ b/Notebooks/Text Analytics/TextAnalytics-BasicFeaturisation_Similarity.ipynb @@ -0,0 +1 @@ +{"cells":[{"cell_type":"code","source":["%scala\nval filepath1= \"abfss://.dfs.core.windows.net/mldata/Twitter_NLP/train_tweets.csv\"\nvar df1=spark.read.format(\"csv\").option(\"header\", \"true\").option(\"delimiter\", \",\").load(filepath1)\ndf1.createOrReplaceTempView(\"train_twitter\")\n\nval filepath2= \"abfss://.dfs.core.windows.net/mldata/Twitter_NLP/test_tweets.csv\"\nvar df2=spark.read.format(\"csv\").option(\"header\", \"true\").option(\"delimiter\", \",\").load(filepath2)\ndf2.createOrReplaceTempView(\"test_twitter\")\n\n"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"Read Train & Test datasets- Twitter","showTitle":true,"inputWidgets":{},"nuid":"ea682da2-0638-474f-9868-6bdc4475cd80"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":[""]}}],"execution_count":0},{"cell_type":"code","source":["train_twitter= spark.sql(\"\"\"select * from train_twitter\"\"\")\ntest_twitter= spark.sql(\"\"\"select * from test_twitter\"\"\")\n\ntrain_twitter = train_twitter.toPandas()\noutdir = '/dbfs/FileStore/train_twitter.csv'\ntrain_twitter.to_csv(outdir, index=False)\n\ntest_twitter = test_twitter.toPandas()\noutdir = '/dbfs/FileStore/test_twitter.csv'\ntest_twitter.to_csv(outdir, index=False)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"Any column manipulation + pandas conversion","showTitle":true,"inputWidgets":{},"nuid":"80a1a9c3-6448-4203-b90c-10eafdfac515"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"
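For reference, the same load can be written end-to-end in Python, avoiding the %scala cell and the temp-view hop into the pandas conversion cell. A minimal PySpark sketch, assuming the Databricks-provided spark session and keeping the abfss paths truncated exactly as in the original (a real container/storage account must still be filled in):

```python
# Hedged sketch: all-Python equivalent of the Scala read + pandas conversion.
# The abfss paths are kept truncated as in the original and must be completed.
filepath1 = "abfss://.dfs.core.windows.net/mldata/Twitter_NLP/train_tweets.csv"
filepath2 = "abfss://.dfs.core.windows.net/mldata/Twitter_NLP/test_tweets.csv"

# spark is the SparkSession that Databricks provides in every notebook
train_sdf = spark.read.format("csv").option("header", "true").option("delimiter", ",").load(filepath1)
test_sdf = spark.read.format("csv").option("header", "true").option("delimiter", ",").load(filepath2)

# toPandas() collects everything to the driver, so it only suits data that
# fits in driver memory (the ~32k tweets used here are fine)
train_twitter = train_sdf.toPandas()
test_twitter = test_sdf.toPandas()
train_twitter.to_csv('/dbfs/FileStore/train_twitter.csv', index=False)
test_twitter.to_csv('/dbfs/FileStore/test_twitter.csv', index=False)
```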
","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\n
"]}}],"execution_count":0},{"cell_type":"code","source":["%pip install nltk\n%pip install textblob"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"Install NLP nltk module","showTitle":true,"inputWidgets":{},"nuid":"c0d5d633-19c4-4bd7-94e8-9895486f4355"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"
Python interpreter will be restarted.\nCollecting nltk\n Downloading nltk-3.6.3-py3-none-any.whl (1.5 MB)\nRequirement already satisfied: joblib in /databricks/python3/lib/python3.7/site-packages (from nltk) (0.14.1)\nCollecting tqdm\n Downloading tqdm-4.62.2-py2.py3-none-any.whl (76 kB)\nCollecting regex\n Downloading regex-2021.8.28-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (745 kB)\nCollecting click\n Downloading click-8.0.1-py3-none-any.whl (97 kB)\nCollecting importlib-metadata\n Downloading importlib_metadata-4.8.1-py3-none-any.whl (17 kB)\nCollecting typing-extensions>=3.6.4\n Downloading typing_extensions-3.10.0.2-py3-none-any.whl (26 kB)\nCollecting zipp>=0.5\n Downloading zipp-3.5.0-py3-none-any.whl (5.7 kB)\nInstalling collected packages: zipp, typing-extensions, importlib-metadata, tqdm, regex, click, nltk\nSuccessfully installed click-8.0.1 importlib-metadata-4.8.1 nltk-3.6.3 regex-2021.8.28 tqdm-4.62.2 typing-extensions-3.10.0.2 zipp-3.5.0\nPython interpreter will be restarted.\nPython interpreter will be restarted.\nCollecting textblob\n Downloading textblob-0.15.3-py2.py3-none-any.whl (636 kB)\nRequirement already satisfied: nltk>=3.1 in /local_disk0/.ephemeral_nfs/envs/pythonEnv-5696162e-edf7-4e07-b896-ada6aa53ee56/lib/python3.7/site-packages (from textblob) (3.6.3)\nRequirement already satisfied: joblib in /databricks/python3/lib/python3.7/site-packages (from nltk>=3.1->textblob) (0.14.1)\nRequirement already satisfied: tqdm in /local_disk0/.ephemeral_nfs/envs/pythonEnv-5696162e-edf7-4e07-b896-ada6aa53ee56/lib/python3.7/site-packages (from nltk>=3.1->textblob) (4.62.2)\nRequirement already satisfied: regex in /local_disk0/.ephemeral_nfs/envs/pythonEnv-5696162e-edf7-4e07-b896-ada6aa53ee56/lib/python3.7/site-packages (from nltk>=3.1->textblob) (2021.8.28)\nRequirement already satisfied: click in /local_disk0/.ephemeral_nfs/envs/pythonEnv-5696162e-edf7-4e07-b896-ada6aa53ee56/lib/python3.7/site-packages (from nltk>=3.1->textblob) (8.0.1)\nRequirement already satisfied: importlib-metadata in /local_disk0/.ephemeral_nfs/envs/pythonEnv-5696162e-edf7-4e07-b896-ada6aa53ee56/lib/python3.7/site-packages (from click->nltk>=3.1->textblob) (4.8.1)\nRequirement already satisfied: typing-extensions>=3.6.4 in /local_disk0/.ephemeral_nfs/envs/pythonEnv-5696162e-edf7-4e07-b896-ada6aa53ee56/lib/python3.7/site-packages (from importlib-metadata->click->nltk>=3.1->textblob) (3.10.0.2)\nRequirement already satisfied: zipp>=0.5 in /local_disk0/.ephemeral_nfs/envs/pythonEnv-5696162e-edf7-4e07-b896-ada6aa53ee56/lib/python3.7/site-packages (from importlib-metadata->click->nltk>=3.1->textblob) (3.5.0)\nInstalling collected packages: textblob\nSuccessfully installed textblob-0.15.3\nPython interpreter will be restarted.\n
","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\n
Python interpreter will be restarted.\nCollecting nltk\n Downloading nltk-3.6.3-py3-none-any.whl (1.5 MB)\nRequirement already satisfied: joblib in /databricks/python3/lib/python3.7/site-packages (from nltk) (0.14.1)\nCollecting tqdm\n Downloading tqdm-4.62.2-py2.py3-none-any.whl (76 kB)\nCollecting regex\n Downloading regex-2021.8.28-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (745 kB)\nCollecting click\n Downloading click-8.0.1-py3-none-any.whl (97 kB)\nCollecting importlib-metadata\n Downloading importlib_metadata-4.8.1-py3-none-any.whl (17 kB)\nCollecting typing-extensions>=3.6.4\n Downloading typing_extensions-3.10.0.2-py3-none-any.whl (26 kB)\nCollecting zipp>=0.5\n Downloading zipp-3.5.0-py3-none-any.whl (5.7 kB)\nInstalling collected packages: zipp, typing-extensions, importlib-metadata, tqdm, regex, click, nltk\nSuccessfully installed click-8.0.1 importlib-metadata-4.8.1 nltk-3.6.3 regex-2021.8.28 tqdm-4.62.2 typing-extensions-3.10.0.2 zipp-3.5.0\nPython interpreter will be restarted.\nPython interpreter will be restarted.\nCollecting textblob\n Downloading textblob-0.15.3-py2.py3-none-any.whl (636 kB)\nRequirement already satisfied: nltk>=3.1 in /local_disk0/.ephemeral_nfs/envs/pythonEnv-5696162e-edf7-4e07-b896-ada6aa53ee56/lib/python3.7/site-packages (from textblob) (3.6.3)\nRequirement already satisfied: joblib in /databricks/python3/lib/python3.7/site-packages (from nltk>=3.1->textblob) (0.14.1)\nRequirement already satisfied: tqdm in /local_disk0/.ephemeral_nfs/envs/pythonEnv-5696162e-edf7-4e07-b896-ada6aa53ee56/lib/python3.7/site-packages (from nltk>=3.1->textblob) (4.62.2)\nRequirement already satisfied: regex in /local_disk0/.ephemeral_nfs/envs/pythonEnv-5696162e-edf7-4e07-b896-ada6aa53ee56/lib/python3.7/site-packages (from nltk>=3.1->textblob) (2021.8.28)\nRequirement already satisfied: click in /local_disk0/.ephemeral_nfs/envs/pythonEnv-5696162e-edf7-4e07-b896-ada6aa53ee56/lib/python3.7/site-packages (from nltk>=3.1->textblob) (8.0.1)\nRequirement already satisfied: importlib-metadata in /local_disk0/.ephemeral_nfs/envs/pythonEnv-5696162e-edf7-4e07-b896-ada6aa53ee56/lib/python3.7/site-packages (from click->nltk>=3.1->textblob) (4.8.1)\nRequirement already satisfied: typing-extensions>=3.6.4 in /local_disk0/.ephemeral_nfs/envs/pythonEnv-5696162e-edf7-4e07-b896-ada6aa53ee56/lib/python3.7/site-packages (from importlib-metadata->click->nltk>=3.1->textblob) (3.10.0.2)\nRequirement already satisfied: zipp>=0.5 in /local_disk0/.ephemeral_nfs/envs/pythonEnv-5696162e-edf7-4e07-b896-ada6aa53ee56/lib/python3.7/site-packages (from importlib-metadata->click->nltk>=3.1->textblob) (3.5.0)\nInstalling collected packages: textblob\nSuccessfully installed textblob-0.15.3\nPython interpreter will be restarted.\n
"]}}],"execution_count":0},{"cell_type":"code","source":["import warnings\nwarnings.filterwarnings('ignore')"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"111bb0f3-07ca-4f35-92e0-ea1f96acb73a"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"
","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\n
"]}}],"execution_count":0},{"cell_type":"code","source":["import pandas as pd\nimport numpy as np\ntrain = pd.read_csv(\"/dbfs/FileStore/train_twitter.csv\", header='infer')\ntest = pd.read_csv(\"/dbfs/FileStore/test_twitter.csv\", header='infer')\ntext_col='tweet'\ndf=train\n############################################\n\n\nprint('\\n\\n****************TEXT DATA****************\\n\\n')\nprint(df[text_col].head())\n\nprint('\\n\\n1. BASIC FEATURE EXTRACTION\\n\\n')\n\n##Number of Words (Intution- generally the negative sentiments contain a lesser amount of words than the positive ones)\nprint('\\n\\n****************WORD COUNT****************\\n\\n')\ntrain['word_count'] = df[text_col].apply(lambda x: len(str(x).split(\" \")))\nprint(df[[text_col,'word_count']].head())\n\n##Number of characters (This is done by calculating the length of the text, includes spaces)\nprint('\\n\\n****************NUMBER OF CHARACTERS****************\\n\\n')\ndf['char_count'] = df[text_col].str.len() \nprint(df[[text_col,'char_count']].head())\n\n##Average Word Length( sum of the length of all the words and divide it by the total length of the text or total word count in text)\nprint('\\n\\n****************AVERAGE WORD LENGTH****************\\n\\n')\ndf['avg_word'] = df[text_col].apply(lambda x: np.mean([len(w) for w in x.split(\" \")]))\nprint(df[[text_col,'avg_word']].head())\n\n##Count of Special Charachters/ Numbers (str.isalpha() method is used to check if all characters in each string in series are alphabetic(a-z/A-Z))\nprint('\\n\\n****************SPECIAL CHARACHTERS COUNT****************\\n\\n')\ndf[\"special_char\"] = df[text_col].apply(lambda p: sum( not q.isalpha() for q in p ))\ndf['numerics'] = df[text_col].apply(lambda x: len([x for x in x.split() if x.isdigit()]))\nprint(df[[text_col,'special_char','numerics']].head())\n\n##Number of Upper case words (Anger/ extreme emotions in text are often upper case)\nprint('\\n\\n****************UPPER CASE WORDS COUNT****************\\n\\n')\ndf['upper'] = df[text_col].apply(lambda x: len([x for x in x.split() if x.isupper()]))\nprint(df[[text_col,'upper']].head())\n\n\n\n"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"BASIC FEATURE EXTRACTION","showTitle":true,"inputWidgets":{},"nuid":"9ba788d6-d42b-4d2b-88c0-3f721d645ccd"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"
\n\n****************TEXT DATA****************\n\n\n0 @user when a father is dysfunctional and is s...\n1 @user @user thanks for #lyft credit i can't us...\n2 bihday your majesty\n3 #model i love u take with u all the time in ...\n4 factsguide: society now #motivation\nName: tweet, dtype: object\n\n\n1. BASIC FEATURE EXTRACTION\n\n\n\n\n****************WORD COUNT****************\n\n\n tweet word_count\n0 @user when a father is dysfunctional and is s... 21\n1 @user @user thanks for #lyft credit i can't us... 22\n2 bihday your majesty 5\n3 #model i love u take with u all the time in ... 17\n4 factsguide: society now #motivation 8\n\n\n****************NUMBER OF CHARACTERS****************\n\n\n tweet char_count\n0 @user when a father is dysfunctional and is s... 102\n1 @user @user thanks for #lyft credit i can't us... 122\n2 bihday your majesty 21\n3 #model i love u take with u all the time in ... 86\n4 factsguide: society now #motivation 39\n\n\n****************AVERAGE WORD LENGTH****************\n\n\n tweet avg_word\n0 @user when a father is dysfunctional and is s... 3.904762\n1 @user @user thanks for #lyft credit i can't us... 4.590909\n2 bihday your majesty 3.400000\n3 #model i love u take with u all the time in ... 4.117647\n4 factsguide: society now #motivation 4.000000\n\n\n****************SPECIAL CHARACTERS COUNT****************\n\n\n tweet special_char numerics\n0 @user when a father is dysfunctional and is s... 23 0\n1 @user @user thanks for #lyft credit i can't us... 29 0\n2 bihday your majesty 4 0\n3 #model i love u take with u all the time in ... 44 0\n4 factsguide: society now #motivation 9 0\n\n\n****************UPPER CASE WORDS COUNT****************\n\n\n tweet upper\n0 @user when a father is dysfunctional and is s... 0\n1 @user @user thanks for #lyft credit i can't us... 0\n2 bihday your majesty 0\n3 #model i love u take with u all the time in ... 0\n4 factsguide: society now #motivation 0\n
","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\n
\n\n****************TEXT DATA****************\n\n\n0 @user when a father is dysfunctional and is s...\n1 @user @user thanks for #lyft credit i can't us...\n2 bihday your majesty\n3 #model i love u take with u all the time in ...\n4 factsguide: society now #motivation\nName: tweet, dtype: object\n\n\n1. BASIC FEATURE EXTRACTION\n\n\n\n\n****************WORD COUNT****************\n\n\n tweet word_count\n0 @user when a father is dysfunctional and is s... 21\n1 @user @user thanks for #lyft credit i can't us... 22\n2 bihday your majesty 5\n3 #model i love u take with u all the time in ... 17\n4 factsguide: society now #motivation 8\n\n\n****************NUMBER OF CHARACTERS****************\n\n\n tweet char_count\n0 @user when a father is dysfunctional and is s... 102\n1 @user @user thanks for #lyft credit i can't us... 122\n2 bihday your majesty 21\n3 #model i love u take with u all the time in ... 86\n4 factsguide: society now #motivation 39\n\n\n****************AVERAGE WORD LENGTH****************\n\n\n tweet avg_word\n0 @user when a father is dysfunctional and is s... 3.904762\n1 @user @user thanks for #lyft credit i can't us... 4.590909\n2 bihday your majesty 3.400000\n3 #model i love u take with u all the time in ... 4.117647\n4 factsguide: society now #motivation 4.000000\n\n\n****************SPECIAL CHARACTERS COUNT****************\n\n\n tweet special_char numerics\n0 @user when a father is dysfunctional and is s... 23 0\n1 @user @user thanks for #lyft credit i can't us... 29 0\n2 bihday your majesty 4 0\n3 #model i love u take with u all the time in ... 44 0\n4 factsguide: society now #motivation 9 0\n\n\n****************UPPER CASE WORDS COUNT****************\n\n\n tweet upper\n0 @user when a father is dysfunctional and is s... 0\n1 @user @user thanks for #lyft credit i can't us... 0\n2 bihday your majesty 0\n3 #model i love u take with u all the time in ... 0\n4 factsguide: society now #motivation 0\n
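As flagged in the cell above, word_count splits on a single literal space, so runs of consecutive spaces produce empty tokens that inflate the count (and similarly deflate avg_word). A tiny standalone sketch of the difference, using a made-up example string:

```python
# Illustrative sketch (hypothetical example string): split(" ") keeps the
# empty token created by a double space, split() does not.
tweet = "#model  i love u take with u"   # note the double space after '#model'

print(len(tweet.split(" ")))   # 8 -> includes one empty '' token
print(len(tweet.split()))      # 7 -> splits on any run of whitespace
```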
"]}}],"execution_count":0},{"cell_type":"code","source":["#import pandas as pd\n#import numpy as np\n#train = pd.read_csv(\"/dbfs/FileStore/train_twitter.csv\", header='infer')\n#test = pd.read_csv(\"/dbfs/FileStore/test_twitter.csv\", header='infer')\n#text_col='tweet'\n#df=train\n############################################\n\n\nprint('\\n\\n****************TEXT DATA****************\\n\\n')\nprint(df[text_col].head())\n\nprint('\\n\\n 2. BASIC PRE-PROCESSING\\n\\n')\n\n##Structural standardisation (Lower case every word as 'INDIA', 'India' treated differently)\n#x = \"#\".join(myList)-->x returns the List elements as string separated by ''#''\nprint('\\n\\n****************CONVERT TO LOWER CASE****************\\n\\n')\ndf[text_col] = df[text_col].apply(lambda x: \" \".join(x.lower() for x in x.split())) \nprint(df[text_col].head())\n\n##Remove Punctuations (as it doesn’t add any extra information while treating text data)\n#\\w=[a-zA-Z0-9_], \\s=Unicode whitespace characters (which includes [\\t\\n\\r\\f\\v]\nprint('\\n\\n****************REMOVE PUNCTUATIONS****************\\n\\n')\ndf[text_col] = df[text_col].str.replace('[^\\w\\s]','')\nprint(df[text_col].head())\n\n##Stop words (or commonly occurring English words) removal (add no extra information to text data)\nprint('\\n\\n****************STOP WORDS REMOVAL****************\\n\\n')\nfrom nltk.corpus import stopwords\nimport nltk\nnltk.download('stopwords')\nnltk.download('punkt')\nstop = stopwords.words('english')\ndf[text_col] = df[text_col].apply(lambda x: \" \".join(x for x in x.split() if x not in stop))\nprint(df[text_col].head())\n\n##Most frequent words appearing throughout corpus removal (as their presence will not of any use in classification of our text data)\n#Get top 10 most frequent words\nprint('\\n\\n****************MOST FREQUENT WORDS REMOVAL****************\\n\\n')\nfreq = pd.Series(' '.join(df[text_col]).split()).value_counts()[:10]\nfreq = list(freq.index)\nprint(\"Most Frequent Words: \",freq)\ntrain[text_col] = train[text_col].apply(lambda x: \" \".join(x for x in x.split() if x not in freq))\nprint(train[text_col].head())\n\n##Rare Words removal (Because they’re so rare, the association between them and other words is dominated by noise)\n#Get top 10 least frequent words\nprint('\\n\\n****************MOST RARE WORDS REMOVAL****************\\n\\n')\nfreq = pd.Series(' '.join(train[text_col]).split()).value_counts()[-10:]\nfreq = list(freq.index)\nprint(\"Least Frequent Words: \",freq)\ntrain[text_col] = train[text_col].apply(lambda x: \" \".join(x for x in x.split() if x not in freq))\nprint(train[text_col].head())\n\n## Spelling correction (this also will help us in reducing multiple copies of same words and treating them differently)\n#Take a lot of time to make these corrections. Limit operation to first five rows for testing\n#We should also keep in mind that words are often used in their abbreviated form. For instance, ‘your’ is used as ‘ur’. 
We should treat this before the spelling correction step, otherwise these words might be transformed into any other word like the one shown below: 'ur' used for 'your' --Changed to-->'or'\nprint('\\n\\n****************SPELLING CORRECTION****************\\n\\n')\nfrom textblob import TextBlob\ntrain[text_col][:5] = train[text_col][:5].apply(lambda x: \" \".join(str(TextBlob(x).correct()) for x in x.split())) \nprint(train[text_col].head())\n\n##Stemming (removal of suffices, like “ing”, “ly”, “s” etc to get the base word out of different forms of the same word)\n#Take a lot of time to make these corrections. Limit operation to first five rows for testing\nprint('\\n\\n****************STEMMING****************\\n\\n')\nfrom nltk.stem import PorterStemmer\nst = PorterStemmer()\ntrain[text_col][:5] = train[text_col][:5].apply(lambda x: \" \".join(st.stem(word) for word in x.split()))\nprint(train[text_col].head())\n\n##Lemmatization (It is a more effective option than stemming because it converts the word into its root word, rather than just stripping the suffices)\nprint('\\n\\n****************LEMMATISATION****************\\n\\n')\nfrom textblob import Word\nnltk.download('wordnet')\ntrain[text_col] = train[text_col].apply(lambda x: \" \".join(Word(word).lemmatize() for word in x.split()))\nprint(train[text_col].head())\n\n##N-Grams Identification\nprint('\\n\\n****************N-Grams****************\\n\\n')\n#Get Top 10 N-Grams\ntext=' '.join(df[text_col].values)\nfrom nltk.collocations import *\ntokens = nltk.word_tokenize(text)\nbigram_measures = nltk.collocations.BigramAssocMeasures()\nfinder = BigramCollocationFinder.from_words(tokens)\nTop_bigrams=sorted(finder.ngram_fd.items(),key=lambda x: x[1],reverse=True)\nprint(\"Top 10 N-Grams :\",Top_bigrams[:10])\n\n##Sentiment Analysis\n#Take a lot of time to make these corrections. Limit operation to first five rows for testing\n#Returns a tuple representing polarity and subjectivity of each tweet. Here, we only extract polarity as it indicates the sentiment as value nearer to 1 means a positive sentiment and values nearer to -1 means a negative sentiment. 
This can also work as a feature for building a machine learning model.\nprint('\\n\\n****************SENTIMENT ANALYSIS****************\\n\\n')\ndf['sentiment']=\"\"\ndf['sentiment'][:5] = df[text_col][:5].apply(lambda x: TextBlob(x).sentiment[0])\nprint(df[[text_col,'sentiment']].head())\n\n\n############################################\nprint('\\n\\n****************COMBINED COLUMNS****************\\n\\n')\n##Combine all columns together to feed into vectorizer\ncols = [text_col,'word_count','char_count','avg_word','special_char','numerics','upper','sentiment']\ndf['combined'] = df[cols].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)\nprint(df['combined'].head())\n\n##TF-IDF Scoring\n#Use this vector to get cosine similarity between the text rows\nprint('\\n\\n****************TF-IDF VECTORIZE****************\\n\\n')\nnltk.download('punkt')\nfrom sklearn.feature_extraction.text import TfidfVectorizer\ntfidf = TfidfVectorizer(max_features=1000, lowercase=True, analyzer='word' ,stop_words= 'english' ,ngram_range=(1,1))\ntrain_vect = tfidf.fit_transform(df['combined'])\ntfidf_df=pd.DataFrame(train_vect.toarray(), index= df.index.to_list())\nprint(\"Shape of Text Converted to vector after adding tf-idf scores :\",tfidf_df.shape)\n"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"BASIC PRE-PROCESSING","showTitle":true,"inputWidgets":{},"nuid":"dda12214-b927-4f08-9bce-639622191ba1"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"
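A minimal standalone sketch contrasting the two normalisation steps used above: PorterStemmer crudely strips suffixes (often yielding non-words), while WordNet lemmatization maps each word to a dictionary root. It assumes nltk and textblob are installed and the wordnet corpus has been downloaded, as in this notebook:

```python
# Sketch: stemming chops suffixes, lemmatization returns dictionary words;
# assumes nltk + textblob are installed and nltk.download('wordnet') has run.
from nltk.stem import PorterStemmer
from textblob import Word

st = PorterStemmer()
for w in ["studies", "dysfunctional", "majesty", "running"]:
    print(w, "-> stem:", st.stem(w), "| lemma:", Word(w).lemmatize())
# e.g. 'studies' stems to 'studi' but lemmatizes to 'study'; 'majesty'
# stems to the non-word 'majesti' but lemmatizes to itself
```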
\n\n****************TEXT DATA****************\n\n\n0 @user when a father is dysfunctional and is s...\n1 @user @user thanks for #lyft credit i can't us...\n2 bihday your majesty\n3 #model i love u take with u all the time in ...\n4 factsguide: society now #motivation\nName: tweet, dtype: object\n\n\n 2. BASIC PRE-PROCESSING\n\n\n\n\n****************CONVERT TO LOWER CASE****************\n\n\n0 @user when a father is dysfunctional and is so...\n1 @user @user thanks for #lyft credit i can't us...\n2 bihday your majesty\n3 #model i love u take with u all the time in ur...\n4 factsguide: society now #motivation\nName: tweet, dtype: object\n\n\n****************REMOVE PUNCTUATIONS****************\n\n\n0 user when a father is dysfunctional and is so ...\n1 user user thanks for lyft credit i cant use ca...\n2 bihday your majesty\n3 model i love u take with u all the time in urð...\n4 factsguide society now motivation\nName: tweet, dtype: object\n\n\n****************STOP WORDS REMOVAL****************\n\n\n[nltk_data] Downloading package stopwords to /root/nltk_data...\n[nltk_data] Package stopwords is already up-to-date!\n[nltk_data] Downloading package punkt to /root/nltk_data...\n[nltk_data] Package punkt is already up-to-date!\n0 user father dysfunctional selfish drags kids d...\n1 user user thanks lyft credit cant use cause do...\n2 bihday majesty\n3 model love u take u time urð ðððð ððð\n4 factsguide society motivation\nName: tweet, dtype: object\n\n\n****************MOST FREQUENT WORDS REMOVAL****************\n\n\nMost Frequent Words: ['user', 'love', 'ð', 'day', 'â', 'happy', 'amp', 'im', 'u', 'time']\n0 father dysfunctional selfish drags kids dysfun...\n1 thanks lyft credit cant use cause dont offer w...\n2 bihday majesty\n3 model take urð ðððð ððð\n4 factsguide society motivation\nName: tweet, dtype: object\n\n\n****************MOST RARE WORDS REMOVAL****************\n\n\nLeast Frequent Words: ['ptrish_g32', 'alfinahooy', 'rodeo', 'restandrelaxationtime', 'blessingsðð½', 'bbggirlsâ', 'beastmoderoarbiascanlifestyle', 'bestabiamo', 'ageless', 'loveâïâï']\n0 father dysfunctional selfish drags kids dysfun...\n1 thanks lyft credit cant use cause dont offer w...\n2 bihday majesty\n3 model take urð ðððð ððð\n4 factsguide society motivation\nName: tweet, dtype: object\n\n\n****************SPELLING CORRECTION****************\n\n\n0 father dysfunctional selfish drags kiss dysfun...\n1 thanks left credit can use cause dont offer wh...\n2 midday majesty\n3 model take or ðððð ððð\n4 factsguide society motivation\nName: tweet, dtype: object\n\n\n****************STEMMING****************\n\n\n0 father dysfunct selfish drag kiss dysfunct run\n1 thank left credit can use caus dont offer whee...\n2 midday majesti\n3 model take or ðððð ððð\n4 factsguid societi motiv\nName: tweet, dtype: object\n\n\n****************LEMMATISATION****************\n\n\n[nltk_data] Downloading package wordnet to /root/nltk_data...\n[nltk_data] Package wordnet is already up-to-date!\n0 father dysfunct selfish drag kiss dysfunct run\n1 thank left credit can use caus dont offer whee...\n2 midday majesti\n3 model take or ðððð ððð\n4 factsguid societi motiv\nName: tweet, dtype: object\n\n\n****************N-Grams****************\n\n\nTop 10 N-Grams : [(('thankful', 'positive'), 404), (('positive', 'affirmation'), 352), (('cant', 'wait'), 330), (('model', 'take'), 325), (('ðððð', 'ððð'), 325), (('take', 'urð'), 324), (('urð', 'ðððð'), 324), (('i_am', 'positive'), 304), (('blog', 'silver'), 295), (('silver', 'gold'), 
283)]\n\n\n****************SENTIMENT ANALYSIS****************\n\n\n tweet sentiment\n0 father dysfunct selfish drag kiss dysfunct run -0.3\n1 thank left credit can use caus dont offer whee... 0\n2 midday majesti 0\n3 model take or ðððð ððð 0\n4 factsguid societi motiv 0\n\n\n****************COMBINED COLUMNS****************\n\n\n0 father dysfunct selfish drag kiss dysfunct run...\n1 thank left credit can use caus dont offer whee...\n2 midday majesti 5 21 3.4 4 0 0 0.0\n3 model take or ðððð ððð 17 86 4.117647058823529...\n4 factsguid societi motiv 8 39 4.0 9 0 0 0.0\nName: combined, dtype: object\n\n\n****************TF-IDF VECTORIZE****************\n\n\n[nltk_data] Downloading package punkt to /root/nltk_data...\n[nltk_data] Package punkt is already up-to-date!\nShape of Text Converted to vector after adding tf-idf scores : (31962, 1000)\n
","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\n
\n\n****************TEXT DATA****************\n\n\n0 @user when a father is dysfunctional and is s...\n1 @user @user thanks for #lyft credit i can't us...\n2 bihday your majesty\n3 #model i love u take with u all the time in ...\n4 factsguide: society now #motivation\nName: tweet, dtype: object\n\n\n 2. BASIC PRE-PROCESSING\n\n\n\n\n****************CONVERT TO LOWER CASE****************\n\n\n0 @user when a father is dysfunctional and is so...\n1 @user @user thanks for #lyft credit i can't us...\n2 bihday your majesty\n3 #model i love u take with u all the time in ur...\n4 factsguide: society now #motivation\nName: tweet, dtype: object\n\n\n****************REMOVE PUNCTUATIONS****************\n\n\n0 user when a father is dysfunctional and is so ...\n1 user user thanks for lyft credit i cant use ca...\n2 bihday your majesty\n3 model i love u take with u all the time in urð...\n4 factsguide society now motivation\nName: tweet, dtype: object\n\n\n****************STOP WORDS REMOVAL****************\n\n\n[nltk_data] Downloading package stopwords to /root/nltk_data...\n[nltk_data] Package stopwords is already up-to-date!\n[nltk_data] Downloading package punkt to /root/nltk_data...\n[nltk_data] Package punkt is already up-to-date!\n0 user father dysfunctional selfish drags kids d...\n1 user user thanks lyft credit cant use cause do...\n2 bihday majesty\n3 model love u take u time urð ðððð ððð\n4 factsguide society motivation\nName: tweet, dtype: object\n\n\n****************MOST FREQUENT WORDS REMOVAL****************\n\n\nMost Frequent Words: ['user', 'love', 'ð', 'day', 'â', 'happy', 'amp', 'im', 'u', 'time']\n0 father dysfunctional selfish drags kids dysfun...\n1 thanks lyft credit cant use cause dont offer w...\n2 bihday majesty\n3 model take urð ðððð ððð\n4 factsguide society motivation\nName: tweet, dtype: object\n\n\n****************MOST RARE WORDS REMOVAL****************\n\n\nLeast Frequent Words: ['ptrish_g32', 'alfinahooy', 'rodeo', 'restandrelaxationtime', 'blessingsðð½', 'bbggirlsâ', 'beastmoderoarbiascanlifestyle', 'bestabiamo', 'ageless', 'loveâïâï']\n0 father dysfunctional selfish drags kids dysfun...\n1 thanks lyft credit cant use cause dont offer w...\n2 bihday majesty\n3 model take urð ðððð ððð\n4 factsguide society motivation\nName: tweet, dtype: object\n\n\n****************SPELLING CORRECTION****************\n\n\n0 father dysfunctional selfish drags kiss dysfun...\n1 thanks left credit can use cause dont offer wh...\n2 midday majesty\n3 model take or ðððð ððð\n4 factsguide society motivation\nName: tweet, dtype: object\n\n\n****************STEMMING****************\n\n\n0 father dysfunct selfish drag kiss dysfunct run\n1 thank left credit can use caus dont offer whee...\n2 midday majesti\n3 model take or ðððð ððð\n4 factsguid societi motiv\nName: tweet, dtype: object\n\n\n****************LEMMATISATION****************\n\n\n[nltk_data] Downloading package wordnet to /root/nltk_data...\n[nltk_data] Package wordnet is already up-to-date!\n0 father dysfunct selfish drag kiss dysfunct run\n1 thank left credit can use caus dont offer whee...\n2 midday majesti\n3 model take or ðððð ððð\n4 factsguid societi motiv\nName: tweet, dtype: object\n\n\n****************N-Grams****************\n\n\nTop 10 N-Grams : [(('thankful', 'positive'), 404), (('positive', 'affirmation'), 352), (('cant', 'wait'), 330), (('model', 'take'), 325), (('ðððð', 'ððð'), 325), (('take', 'urð'), 324), (('urð', 'ðððð'), 324), (('i_am', 'positive'), 304), (('blog', 'silver'), 295), (('silver', 'gold'), 
283)]\n\n\n****************SENTIMENT ANALYSIS****************\n\n\n tweet sentiment\n0 father dysfunct selfish drag kiss dysfunct run -0.3\n1 thank left credit can use caus dont offer whee... 0\n2 midday majesti 0\n3 model take or ðððð ððð 0\n4 factsguid societi motiv 0\n\n\n****************COMBINED COLUMNS****************\n\n\n0 father dysfunct selfish drag kiss dysfunct run...\n1 thank left credit can use caus dont offer whee...\n2 midday majesti 5 21 3.4 4 0 0 0.0\n3 model take or ðððð ððð 17 86 4.117647058823529...\n4 factsguid societi motiv 8 39 4.0 9 0 0 0.0\nName: combined, dtype: object\n\n\n****************TF-IDF VECTORIZE****************\n\n\n[nltk_data] Downloading package punkt to /root/nltk_data...\n[nltk_data] Package punkt is already up-to-date!\nShape of Text Converted to vector after adding tf-idf scores : (31962, 1000)\n
"]}}],"execution_count":0},{"cell_type":"code","source":["\"\"\"\n#Get cosine similarity between two texts\nStep1. Convert the text into vector of numbers (Using TF-IDF scores)\n a)TF= Frequency of a word in the given sentence or Term-Frequency\n b)IDF=Inverse Doc Frequency is 1/ number of times a word appears accross all documents. This is important because some words like is/am/are/the are present throughout the text and add no value/variability when present in a sentence. So allot these words a lower score by taking the inverse. We can ignore the IDF score as we have removed the stop words and most frequent words accross.\n c)TF-IDF score =TF score * IDF score\n d)text_to_vector function returns a tuple of { word: Frequency } or TF score. Thus converts a text to vector.\n \nStep2.Calculate cosine similarity of the two vectors\n a)cos_sim(vectA,vectB)=dot product=(xa.xb + ya.yb + za.zb)/[(sqrt(xa.xa + ya.ya + za.za)).(sqrt(xb.xb + yb.yb + zb.zb))]\n where vectA=(xa,ya,za) ; vectB=(xb,yb,zb)\n\"\"\"\nimport math\nimport re\nfrom collections import Counter\n\nWORD = re.compile(r\"\\w+\") \ndef get_cosine(vec1, vec2):\n intersection = set(vec1.keys()) & set(vec2.keys())\n numerator = sum([vec1[x] * vec2[x] for x in intersection])\n\n sum1 = sum([vec1[x] ** 2 for x in list(vec1.keys())])\n sum2 = sum([vec2[x] ** 2 for x in list(vec2.keys())])\n denominator = math.sqrt(sum1) * math.sqrt(sum2)\n\n if not denominator:\n return 0.0\n else:\n return float(numerator) / denominator\n\n\ndef text_to_vector(text):\n words = WORD.findall(text)\n return Counter(words)\n\n#######################################################CALLING#############################################################\nprint(\"Text 1 :\",df['combined'][0])\nprint(\"Text 2 :\",df['combined'][1])\nprint(\"Text 3 :\",df['combined'][2])\n\ntext1 = df['combined'][0]\ntext2 = df['combined'][1]\ntext3 = df['combined'][2]\n\nvector1 = text_to_vector(text1)\nvector2 = text_to_vector(text2)\nvector3 = text_to_vector(text3)\n\ncosine12 = get_cosine(vector1, vector2)\nprint(\"Cosine Similarity between String 1 & String 2:\", cosine12)\n\ncosine13 = get_cosine(vector1, vector3)\nprint(\"Cosine Similarity between String 1 & String 3:\", cosine13)\n\n#More the cosine similarity the closer the two strings are"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"Similarity between two strings","showTitle":true,"inputWidgets":{},"nuid":"48780ea4-0fb6-4dcb-8dba-cc315ad698d5"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"
Text 1 : father dysfunct selfish drag kiss dysfunct run 21 102 3.9047619047619047 23 0 0 -0.3\nText 2 : thank left credit can use caus dont offer wheelchair van pox disappoint getthank 22 122 4.590909090909091 29 0 0 0.0\nText 3 : midday majesti 5 21 3.4 4 0 0 0.0\nCosine Similarity between String 1 & String 2: 0.4036036763977875\nCosine Similarity between String 1 & String 3: 0.5883484054145521\n
","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\n
Text 1 : father dysfunct selfish drag kiss dysfunct run 21 102 3.9047619047619047 23 0 0 -0.3\nText 2 : thank left credit can use caus dont offer wheelchair van pox disappoint getthank 22 122 4.590909090909091 29 0 0 0.0\nText 3 : midday majesti 5 21 3.4 4 0 0 0.0\nCosine Similarity between String 1 & String 2: 0.4036036763977875\nCosine Similarity between String 1 & String 3: 0.5883484054145521\n
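As a sanity check on the hand-rolled implementation, the same term-frequency cosine can be reproduced with scikit-learn. A standalone sketch with two short example strings (the second one is hypothetical, chosen to share two tokens with the first):

```python
# Standalone sketch: verify the hand-rolled TF cosine against sklearn's
# cosine_similarity computed on CountVectorizer term counts.
import math
import re
from collections import Counter

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

WORD = re.compile(r"\w+")

def get_cosine(vec1, vec2):
    shared = set(vec1) & set(vec2)
    num = sum(vec1[w] * vec2[w] for w in shared)
    den = math.sqrt(sum(v ** 2 for v in vec1.values())) * math.sqrt(sum(v ** 2 for v in vec2.values()))
    return num / den if den else 0.0

text1 = "father dysfunct selfish drag kiss dysfunct run"
text2 = "father run midday"   # hypothetical string sharing two tokens with text1

hand_rolled = get_cosine(Counter(WORD.findall(text1)), Counter(WORD.findall(text2)))

# token_pattern=r"\w+" mirrors the WORD regex (the default drops 1-char tokens)
counts = CountVectorizer(token_pattern=r"\w+").fit_transform([text1, text2])
from_sklearn = cosine_similarity(counts[0], counts[1])[0, 0]

print(hand_rolled, from_sklearn)  # both should agree (~0.385 here)
```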
"]}}],"execution_count":0}],"metadata":{"application/vnd.databricks.v1+notebook":{"notebookName":"TextAnalytics-BasicFeaturisation_Similarity (1)","dashboards":[],"notebookMetadata":{"pythonIndentUnit":2},"language":"python","widgets":{"EnvType":{"nuid":"1d032462-cf28-4cf5-93de-11cbc9413fe2","currentValue":"DEV","widgetInfo":{"widgetType":"text","name":"EnvType","defaultValue":"DEV","label":null,"options":{"widgetType":"text","validationRegex":null}}}},"notebookOrigID":2301443945016483}},"nbformat":4,"nbformat_minor":0}