{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## **Web Scraping** in **zone-h** \n",
    "```\n",
    "Code using python3\n",
    "for Jupyter Notebook\n",
    "\n",
    "Work in VS Code\n",
    "Not for google colab\n",
    "\n",
    "Link Web Scraping :\n",
    "http://zone-h.org\n",
    "\n",
    "Code By Natasya \n",
    "medium.com/@liontin\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 330,
   "metadata": {},
   "outputs": [],
   "source": [
    "import csv\n",
    "import os\n",
    "\n",
    "import pandas as liontin\n",
    "import plotly.express as zs\n",
    "import requests\n",
    "from bs4 import BeautifulSoup as Natasya\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 331,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Archive listing filtered to domain=go.id; %s is replaced by the page number.\n",
    "ARCHIVE_URL = \"https://www.zone-h.org/archive/filter=1/published=0/special=1/domain=go.id/fulltext=1/page=%s\"\n",
    "CSV_HEADER = [\"web\", \"Attacker\", \"waktu\", \"Arsip\"]\n",
    "REQUEST_TIMEOUT = 30  # seconds, so a stalled connection cannot hang the cell forever\n",
    "\n",
    "# Adjust these cookies to the ones your browser holds after opening zone-h.org.\n",
    "# NOTE(review): session IDs are credentials. Prefer exporting ZONEH_PHPSESSID and\n",
    "# ZONEH_ZHE over editing (and committing) the fallback values below.\n",
    "myCookie = {\n",
    "    \"PHPSESSID\": os.environ.get(\"ZONEH_PHPSESSID\", \"11qetovqgsvqklutldop2icmf3\"),\n",
    "    \"ZHE\": os.environ.get(\"ZONEH_ZHE\", \"6cef7232835ba0ab755503ce1c2efedd\"),\n",
    "}\n",
    "\n",
    "\n",
    "def parse_archive_rows(html, page):\n",
    "    \"\"\"Extract one CSV record per entry row of an archive page.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    html : bytes or str\n",
    "        Body of the HTTP response for the archive page.\n",
    "    page : int\n",
    "        Page number; only used to make the error message specific.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    list of list of str\n",
    "        One [web, attacker, date, archive link] record per usable row.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    RuntimeError\n",
    "        If the response contains no <table> element.\n",
    "    \"\"\"\n",
    "    table = Natasya(html, 'html.parser').find('table')\n",
    "    if table is None:\n",
    "        raise RuntimeError(\n",
    "            'No <table> found on page %s; the session cookies may have expired '\n",
    "            'or the site may be asking for a captcha.' % page\n",
    "        )\n",
    "    records = []\n",
    "    # [1:] drops the first class-less row (presumably the column headings).\n",
    "    for ZH in table.find_all('tr', class_=None)[1:]:\n",
    "        kolom = ZH.find_all('td')\n",
    "        if len(kolom) < 10:  # cells 0..9 are read below; shorter rows are not entries\n",
    "            continue\n",
    "        link = kolom[9].find('a')\n",
    "        if link is None or not link.get('href'):\n",
    "            continue\n",
    "        halaman = kolom[7].text.strip()\n",
    "        attacker = kolom[1].text.strip()\n",
    "        kejadian = kolom[0].text.strip().replace('/', '-')\n",
    "        records.append([halaman, attacker, kejadian, 'www.zone-h.org%s' % link.get('href')])\n",
    "    return records\n",
    "\n",
    "\n",
    "pagestart = input('Start page number: ')  # first archive page to scrape\n",
    "pagestop = input('Stop page number: ')  # last archive page to scrape (inclusive)\n",
    "savefile = input('File Name')  # name of the output file, without the extension\n",
    "start = int(pagestart)\n",
    "stop = int(pagestop)\n",
    "\n",
    "csv_path = savefile + '.csv'\n",
    "# Append mode keeps rows from earlier runs, so the header is only written when\n",
    "# the file is new or empty; otherwise it would reappear as a data row on re-runs.\n",
    "need_header = not os.path.exists(csv_path) or os.path.getsize(csv_path) == 0\n",
    "\n",
    "# newline='' is what the csv module expects of its file object; utf-8 matches\n",
    "# the default encoding pandas uses when the file is read back.\n",
    "with requests.Session() as req, open(csv_path, 'a', newline='', encoding='utf-8') as csvfile:\n",
    "    writecsv = csv.writer(csvfile)\n",
    "    if need_header:\n",
    "        writecsv.writerow(CSV_HEADER)\n",
    "    for page in range(start, stop + 1):\n",
    "        url = ARCHIVE_URL % page\n",
    "        data = req.get(url, cookies=myCookie, timeout=REQUEST_TIMEOUT)\n",
    "        writecsv.writerows(parse_archive_rows(data.content, page))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 332,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reads back the CSV written above; the name must match the file saved earlier.\n",
    "zone_H = liontin.read_csv(savefile+'.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 333,
   "metadata": {},
   "outputs": [],
   "source": [
    "# read_csv already returns a DataFrame; the wrap is kept so that zoneH stays defined.\n",
    "zoneH = liontin.DataFrame(zone_H)"
   ]
  },
  { "cell_type": "code", "execution_count": 334, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | web | \n", "Attacker | \n", "waktu | \n", "Arsip | \n", "
---|---|---|---|---|
0 | \n", "satpammantap.binmas.metro.polr... | \n", "K4PUYU4K | \n", "2022-01-26 | \n", "www.zone-h.org/mirror/id/39331180 | \n", "
1 | \n", "polreskapuas.kalteng.polri.go.... | \n", "K4PUYU4K | \n", "2022-01-26 | \n", "www.zone-h.org/mirror/id/39331179 | \n", "
2 | \n", "polresbatu.jatim.polri.go.id/s... | \n", "K4PUYU4K | \n", "2022-01-26 | \n", "www.zone-h.org/mirror/id/39331178 | \n", "
3 | \n", "kapuas.kalteng.polri.go.id/pol... | \n", "K4PUYU4K | \n", "2022-01-26 | \n", "www.zone-h.org/mirror/id/39331177 | \n", "
4 | \n", "dprd.sukabumikota.go.id/a.txt | \n", "galehdotid | \n", "2022-01-26 | \n", "www.zone-h.org/mirror/id/39329599 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
70 | \n", "disperta.madiunkota.go.id/b4.html | \n", "0x1998 | \n", "2022-01-13 | \n", "www.zone-h.org/mirror/id/39259138 | \n", "
71 | \n", "itk.dprdsulsel.go.id | \n", "SABUNMANDI CYBER TEAM | \n", "2022-01-13 | \n", "www.zone-h.org/mirror/id/39258780 | \n", "
72 | \n", "pupr.padangpariamankab.go.id | \n", "XNUXER ACHILL | \n", "2022-01-13 | \n", "www.zone-h.org/mirror/id/39258682 | \n", "
73 | \n", "bpbiabiyoso.kemsos.go.id/index... | \n", "elv1n4 | \n", "2022-01-13 | \n", "www.zone-h.org/mirror/id/39258521 | \n", "
74 | \n", "epipad.probolinggokab.go.id/in... | \n", "MR.5T1Y0 | \n", "2022-01-12 | \n", "www.zone-h.org/mirror/id/39257787 | \n", "
75 rows × 4 columns
\n", "