BeautifulSoup to scrape information from Indeed<\/a> last time, so we can reuse the technique. This time, we apply it to 3 job search engines. Let’s start with Indeed:<\/p>\n\n\n\nsearch_keyword_arr =[\"Java\", \"Python\", \"HTML\", \"SQL\", \"JavaScript\", \"C#\", \"PHP\", \"CSS\", \".NET\", \"angular\", \"react\"] \nlocation = \"\"\nurl_prefix = \"https:\/\/www.indeed.com\"\npre_fix_text = \"Page 1 of \"\nindeed_developer_jobs = []\nfor search_keyword in search_keyword_arr: \n params = {\n 'q':search_keyword+' developer',\n 'l':location\n } \n url = url_prefix + \"\/jobs?\"+urllib.parse.urlencode(params)\n html = urllib.request.urlopen(url)\n soup = BeautifulSoup(html, 'lxml') \n search_count = soup.find('div', {'id':'searchCount'}) \n salary_title_txt = soup.find('div', {'id':'univsrch-salary-title'}).get_text()\n salary_txt = soup.find('p', {'id':'univsrch-salary-currentsalary'}).get_text() \n salary_value = salary_txt[:salary_txt.find(' per year')].replace('$', '').replace(',', '')\n search_count_txt = search_count.get_text().strip()\n if search_count_txt.startswith(pre_fix_text):\n job_count_txt = search_count_txt[len(pre_fix_text):search_count_txt.find('jobs')]\n job_count_value = int(re.sub(',','', job_count_txt)) \n indeed_developer_job = {\"search engine\":\"Indeed\", \"language\":search_keyword, \"job count\":job_count_value, \"salary\": salary_value}\n indeed_developer_jobs.append(indeed_developer_job)\n print(\"Keyword:{}\\tJob count:{}\\t{}:{}\".format(search_keyword, job_count_txt, salary_title_txt, salary_txt))<\/pre>\n\n\n\nKeyword:Java\tJob count:27,939 \tJava Developer salaries in United States:$102,430 per year\nKeyword:Python\tJob count:25,846 \tPython Developer salaries in United States:$122,654 per year\nKeyword:HTML\tJob count:15,383 \tWeb Developer salaries in United States:$76,253 per year\nKeyword:SQL\tJob count:29,565 \tSQL Developer salaries in United States:$84,555 per year\nKeyword:JavaScript\tJob count:27,197 \tJavascript 
Developer salaries in United States:$111,243 per year\nKeyword:C#\tJob count:20,880 \t.NET Developer salaries in United States:$91,681 per year\nKeyword:PHP\tJob count:9,280 \tPHP Developer salaries in United States:$89,089 per year\nKeyword:CSS\tJob count:17,491 \tDeveloper salaries in United States:$97,116 per year\nKeyword:.NET\tJob count:13,997 \t.NET Developer salaries in United States:$91,681 per year\nKeyword:angular\tJob count:11,079 \tFront End Developer salaries in United States:$108,408 per year\nKeyword:react\tJob count:11,415 \tDeveloper salaries in United States:$97,116 per year<\/code><\/pre>\n\n\n\nNow we go for Glassdoor:<\/p>\n\n\n\n
url_prefix = \"https:\/\/www.glassdoor.com\"\nglassdoor_developer_jobs = []\nfor search_keyword in search_keyword_arr: \n url = url_prefix + \"\/Job\/us-{}-developer-jobs-SRCH_IL.0,2_IN1_KO3,{}.htm\".format(re.sub('#','', search_keyword) , len(search_keyword+' developer')+3)\n req = urllib.request.Request(url, headers={'User-Agent' : \"Magic Browser\"}) \n html = urllib.request.urlopen( req )\n soup = BeautifulSoup(html, 'lxml') \n search_count = soup.find('p', {'class':'jobsCount'}) \n search_count_txt = search_count.get_text().strip()\n search_count_value = int(search_count_txt[:search_count_txt.find('Jobs')].replace(',', ''))\n url = url_prefix + \"\/Salaries\/us-{}-developer-salary-SRCH_IL.0,2_IN1_KO3,{}.htm\".format(re.sub('#','', search_keyword) , len(search_keyword+' developer')+3)\n req = urllib.request.Request(url, headers={'User-Agent' : \"Magic Browser\"}) \n html = urllib.request.urlopen( req )\n soup = BeautifulSoup(html, 'lxml') \n salary_detail = soup.find('div', {'class':'OccMedianBasePayStyle__payDetails'}) \n if (salary_detail == None):\n salary_detail = soup.find('div', {'id': 'MeanPay_N'}) \n if (salary_detail == None): \n salary_detail_txt = \"N\/A\"\n salary_value = 0 \n else:\n salary_detail_txt = salary_detail.get_text().strip()\n salary_value = int(salary_detail_txt.split(\"$\",1)[1].replace(\",\",\"\").replace(\"\/yr\", \"\").replace(\"*\",\"\"))\n glassdoor_developer_job = {\"search engine\":\"Glassdoor\", \"language\":search_keyword, \"job count\":search_count_value, \"salary\": salary_value}\n glassdoor_developer_jobs.append(glassdoor_developer_job)\n print(\"Keyword:{}\\tJob count:{}\\t{}\".format(search_keyword, search_count_txt, salary_detail_txt))\n<\/pre>\n\n\n\nKeyword:Java\tJob count:62,454 Jobs\tAverage Base Pay$88,116\/yr\nKeyword:Python\tJob count:57,045 Jobs\tAverage Base Pay$92,000\/yr\nKeyword:HTML\tJob count:11,002 Jobs\t$66,912*\nKeyword:SQL\tJob count:22,496 Jobs\tAverage Base Pay$84,779\/yr\nKeyword:JavaScript\tJob 
count:23,989 Jobs\tAverage Base Pay$72,500\/yr\nKeyword:C#\tJob count:6,877 Jobs\tAverage Base Pay$95,052\/yr\nKeyword:PHP\tJob count:8,471 Jobs\tAverage Base Pay$93,987\/yr\nKeyword:CSS\tJob count:9,223 Jobs\tN\/A\nKeyword:.NET\tJob count:4,546 Jobs\tAverage Base Pay$95,052\/yr\nKeyword:angular\tJob count:9,751 Jobs\tN\/A\nKeyword:react\tJob count:7,707 Jobs\tN\/A<\/code><\/pre>\n\n\n\nAnd finally, we have Monster: <\/p>\n\n\n\n
url_prefix = \"https:\/\/www.monster.com\"\nmonster_developer_jobs = []\nfor search_keyword in search_keyword_arr: \n params = {\n 'q':search_keyword+' developer'\n } \n url = url_prefix + \"\/jobs\/search\/?\"+urllib.parse.urlencode(params)\n html = urllib.request.urlopen(url)\n soup = BeautifulSoup(html, 'lxml') \n search_count = soup.find('h2', {'class':'figure'}) \n search_count_txt = search_count.get_text().strip()\n search_count_txt = (re.search( \"[^\\(](.+?)(?=Jobs)\", search_count_txt)).group()\n monster_developer_job = {\"search engine\":\"Monster\", \"language\":search_keyword, \"job count\":int(search_count_txt), \"salary\": 0}\n monster_developer_jobs.append(monster_developer_job) \n print(\"Keyword:{}\\tJob count:{}\".format(search_keyword, search_count_txt))\n<\/pre>\n\n\n\nKeyword:Java\tJob count:82576 \nKeyword:Python\tJob count:40367 \nKeyword:HTML\tJob count:19999 \nKeyword:SQL\tJob count:37206 \nKeyword:JavaScript\tJob count:32062 \nKeyword:C#\tJob count:23025 \nKeyword:PHP\tJob count:3353 \nKeyword:CSS\tJob count:21743 \nKeyword:.NET\tJob count:16587 \nKeyword:angular\tJob count:11362 \nKeyword:react\tJob count:10006 <\/code><\/pre>\n\n\n\nPlease note that there is no salary information from Monster.<\/p>\n\n\n\n
Language EDA on Job Market<\/h3>\n\n\n\n
Now that we have data from 3 job search engines, we can use it to plot a Job Market chart. First, let’s transform the data into a data frame.<\/p>\n\n\n\n
developer_jobs = indeed_developer_jobs+glassdoor_developer_jobs+monster_developer_jobs\ndeveloper_jobs_df = pd.DataFrame(developer_jobs)\n<\/pre>\n\n\n\nThen we calculate the mean of job count and salary for different languages.<\/p>\n\n\n\n
language_jobcount_df = (developer_jobs_df.groupby([\"language\"])[\"job count\"].mean().reset_index()).sort_values(\"job count\", ascending=False)\ndeveloper_jobs_wo_zero_df = developer_jobs_df.loc[developer_jobs_df['salary'] !=0]\nlanguage_salary_df = developer_jobs_wo_zero_df.groupby([\"language\"])[\"salary\"].mean().reset_index()\nlanguage_jobcount_salary_df = language_jobcount_df.merge(language_salary_df, on=[\"language\"], how='left')\n<\/pre>\n\n\n\nAnd we plot a chart using plotly. <\/p>\n\n\n\n
trace = go.Bar(\n x = language_jobcount_salary_df[\"language\"],\n y = language_jobcount_salary_df[\"job count\"],\n name=\"Job Count\",\n marker=dict(color = \"royalblue\")\n ) \ntrace2 = go.Scatter(\n x = language_jobcount_salary_df[\"language\"],\n y = language_jobcount_salary_df[\"salary\"],\n yaxis='y2',\n name=\"Salary\",\n marker=dict(color = \"orangered\")\n ) \nlayout = go.Layout(\n title = \"Programming Lanaguage on Job Market\",\n xaxis=dict(\n title=\"Language\",\n tickangle=45,\n ),\n yaxis=dict(\n title=\"Job Count\"\n ), \n yaxis2=dict(\n title=\"Salary per year\",\n titlefont=dict(\n color=\"orangered\"\n ),\n tickfont=dict(\n color=\"orangered\"\n ),\n overlaying='y',\n side='right'\n ),\n legend=dict(orientation=\"h\"),\n width = 800, \n height = 500\n )\ndata = [trace, trace2]\nfig = go.Figure(data=data, layout=layout)\npy.iplot(fig, config={'showLink': True})\n<\/pre>\n\n\n\nHere it comes:<\/p>\n\n\n\n