{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Introduction\n", "\n", "A small example notebook showing how to scrape dataset links from the PANGAEA search results (https://www.pangaea.de).\n", "\n", "## Imports\n", "We start off by importing the libraries we need:\n", "\n", "- `requests_html` -> fetches the search pages and renders their JavaScript\n", "- `bs4` (BeautifulSoup) -> parses the rendered HTML\n", "- `urllib` -> URL-encodes the search query" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "from requests_html import AsyncHTMLSession\n", "from bs4 import BeautifulSoup\n", "import urllib" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Define a query / search\n", "\n", "Put the search query you would enter on the website here." ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "query = 'Mass Accumulation Rate/Flux'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Run the code\n", "\n", "The cell below pages through the search results, 10 at a time, and collects the link of every dataset it finds. It stops at the first page that shows no datasets." ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0\n", "10\n", "20\n", "30\n", "40\n", "50\n", "60\n", "70\n", "80\n", "90\n", "100\n", "110\n", "120\n", "130\n", "140\n", "150\n", "160\n", "170\n", "180\n", "190\n", "200\n", "210\n", "220\n", "230\n", "240\n", "250\n", "260\n", "270\n", "280\n", "290\n", "300\n", "310\n", "320\n", "330\n", "340\n", "350\n", "360\n", "370\n", "380\n", "390\n", "400\n", "410\n", "420\n", "430\n", "440\n", "450\n", "460\n", "470\n", "480\n", "490\n", "500\n", "510\n", "520\n", "530\n", "540\n", "550\n", "560\n", "570\n", "580\n", "590\n", "600\n", "610\n", "620\n", "630\n", "640\n", "650\n", "660\n", "670\n", "680\n", "690\n", "700\n", "710\n", "720\n", "730\n", "740\n", "750\n", "760\n", "770\n", "780\n", "790\n", "800\n", "810\n", "820\n", "830\n", "840\n", "850\n", "860\n", "870\n", "880\n", "890\n", "900\n", "910\n", "920\n", "930\n", "940\n", "950\n", "960\n", "970\n", "980\n", "990\n", "1000\n", "1010\n", "1020\n", "1030\n", "1040\n", "1050\n", "1060\n", "1070\n", "1080\n", "1090\n",
"1100\n", "1110\n", "1120\n", "1130\n", "1140\n", "1150\n", "1160\n", "1170\n", "1180\n", "1190\n", "1200\n", "1210\n", "1220\n", "1230\n", "1240\n", "1250\n", "1260\n", "1270\n", "1280\n", "1290\n", "1300\n", "1310\n", "1320\n", "1330\n", "1340\n", "1350\n", "1360\n", "1370\n", "1380\n", "1390\n", "1400\n", "1410\n", "1420\n", "1430\n", "1440\n", "1450\n", "1460\n", "1470\n", "1480\n", "1490\n", "1500\n", "1510\n", "1520\n", "1530\n", "1540\n", "1550\n", "1560\n", "1570\n", "1580\n", "1590\n", "1600\n", "1610\n", "1620\n", "1630\n", "1640\n", "1650\n", "1660\n", "1670\n", "1680\n", "1690\n", "1700\n", "1710\n", "1720\n", "1730\n", "1740\n", "1750\n", "1760\n", "1770\n", "1780\n", "1790\n", "1800\n", "1810\n", "1820\n", "1830\n", "1840\n", "1850\n", "1860\n", "1870\n", "1880\n", "1890\n", "1900\n", "1910\n", "1920\n", "1930\n", "1940\n", "1950\n", "1960\n", "1970\n", "1980\n", "1990\n", "2000\n", "2010\n", "2020\n", "2030\n", "2040\n", "2050\n", "2060\n", "2070\n", "2080\n", "2090\n", "2100\n", "2110\n", "2120\n", "2130\n", "2140\n", "2150\n", "2160\n", "2170\n", "2180\n", "2190\n", "2200\n", "2210\n", "2220\n", "2230\n", "2240\n", "2250\n", "2260\n", "2270\n", "2280\n", "2290\n", "2300\n", "2310\n", "2320\n", "2330\n", "2340\n" ] } ], "source": [ "# Store links\n", "links = []\n", "URL = 'https://www.pangaea.de/?q={query}&offset={offset}'\n", "url_query = urllib.parse.quote_plus(query)\n", "finished = False\n", "offset=0\n", "while not finished:\n", " print(offset)\n", " retry = 5\n", " while retry > 0:\n", " asession = AsyncHTMLSession()\n", " r = await asession.get(URL.format(query=url_query, offset=offset))\n", " await r.html.arender()\n", " resp= r.html.raw_html\n", " soup = BeautifulSoup(resp, 'html.parser')\n", " dataset_links = soup.find_all('a', class_='dataset-link')\n", " if len(dataset_links):\n", " retry = -1\n", " retry -= 1\n", " if not len(dataset_links):\n", " finished = True\n", " for anchor in dataset_links:\n", " link = anchor.get('href')\n", " 
link = \"/\".join(link.split('/')[-2:])\n", " links.append(link)\n", " offset += 10" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "with open('links.txt', 'w') as f:\n", " f.write(\"\\n\".join(links))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python (pangeo)", "language": "python", "name": "python-pangeo" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.9" } }, "nbformat": 4, "nbformat_minor": 4 }