{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "\n",
    "## **Web Scraping** the unofficial **HackerOne** disclosure timeline.\n",
    "```\n",
    "Link Web Scraping :\n",
    "http://h1.nobbd.de/\n",
    "\n",
    "Code By Natasya \n",
    "medium.com/@liontin\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 271,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "from bs4 import BeautifulSoup\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import plotly.express as px"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Walk every page of the timeline and collect two parallel lists:\n",
    "# `data` holds one dict per report, `w` holds one dict per date entry.\n",
    "PAGE_STEP = 20        # increment of the `start` query parameter between requests\n",
    "END_OFFSET = 11840    # exclusive upper bound for `start`\n",
    "REQUEST_TIMEOUT = 30  # seconds; without a timeout a request can block forever\n",
    "\n",
    "data = []\n",
    "w = []\n",
    "url = 'http://h1.nobbd.de/index.php?start='\n",
    "\n",
    "# One Session reuses the HTTP connection across all page requests.\n",
    "with requests.Session() as session:\n",
    "    for page in range(0, END_OFFSET, PAGE_STEP):\n",
    "        resp = session.get(url + str(page), timeout=REQUEST_TIMEOUT)\n",
    "        resp.raise_for_status()  # fail loudly instead of parsing an HTTP error page\n",
    "        soup = BeautifulSoup(resp.text, 'html.parser')\n",
    "\n",
    "        # Filter by CSS class with class_= (attrs expects a dict, not a set).\n",
    "        for wrapper in soup.find_all('div', class_='report-wrapper'):\n",
    "            for report_div in wrapper.find_all('div', class_='report'):\n",
    "                layanan = report_div.find_all('b')[0].text.strip()\n",
    "                bug_hunter = report_div.find_all('a')[1].text.strip()\n",
    "                # Look the title link up once; a missing link or attribute\n",
    "                # yields '' instead of aborting the whole scrape.\n",
    "                title_link = report_div.find('a', class_='title')\n",
    "                report = title_link.get('title', '') if title_link is not None else ''\n",
    "                mirror = title_link.get('href', '') if title_link is not None else ''\n",
    "                data.append({'Company': layanan, 'Title': report, 'Submit': bug_hunter, 'link': mirror})\n",
    "\n",
    "            for date_div in wrapper.find_all('div', class_='date'):\n",
    "                # split() returns a list, so each Date cell is a list of the\n",
    "                # text pieces around any 'wont-fix' marker.\n",
    "                waktu = date_div.text.strip().split('wont-fix')\n",
    "                w.append({'Date': waktu})\n",
    "\n",
    "# The following cells pair `data` and `w` by position, so unequal lengths\n",
    "# mean rows would be dropped or mispaired by the merge.\n",
    "if len(data) != len(w):\n",
    "    print('WARNING: %d reports but %d dates' % (len(data), len(w)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 285,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One frame per scraped list; both get a default 0..n-1 index.\n",
    "df1 = pd.DataFrame(data)\n",
    "df2 = pd.DataFrame(w)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 274,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Inner join on the row index: report i is paired with date i.\n",
    "h1_disclosure = pd.merge(df1, df2, left_index=True, right_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 286,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CompanyTitleSubmitlinkDate
11805HackerOneCSP not consistently appliedjanpaul123https://hackerone.com/reports/321[30 Nov 2013]
11806HackerOnePixel flood attackdutchgraahttps://hackerone.com/reports/390[30 Nov 2013]
11807HackerOneGIF floodingdutchgraahttps://hackerone.com/reports/400[30 Nov 2013]
11808RubyRuby: Heap Overflow in Floating Point Parsingcharliesomehttps://hackerone.com/reports/499[22 Nov 2013]
11809The InternetOpenSSH: Memory corruption in AES-GCM supportmarkushttps://hackerone.com/reports/500[07 Nov 2013]
\n", "
" ], "text/plain": [ " Company Title \\\n", "11805 HackerOne CSP not consistently applied \n", "11806 HackerOne Pixel flood attack \n", "11807 HackerOne GIF flooding \n", "11808 Ruby Ruby: Heap Overflow in Floating Point Parsing \n", "11809 The Internet OpenSSH: Memory corruption in AES-GCM support \n", "\n", " Submit link Date \n", "11805 janpaul123 https://hackerone.com/reports/321 [30 Nov 2013] \n", "11806 dutchgraa https://hackerone.com/reports/390 [30 Nov 2013] \n", "11807 dutchgraa https://hackerone.com/reports/400 [30 Nov 2013] \n", "11808 charliesome https://hackerone.com/reports/499 [22 Nov 2013] \n", "11809 markus https://hackerone.com/reports/500 [07 Nov 2013] " ] }, "execution_count": 286, "metadata": {}, "output_type": "execute_result" } ], "source": [ "h1_disclosure.tail()" ] } ], "metadata": { "interpreter": { "hash": "767d51c1340bd893661ea55ea3124f6de3c7a262a8b4abca0554b478b1e2ff90" }, "kernelspec": { "display_name": "Python 2.7.16 64-bit", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.8" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }