diff --git a/notebooks/01_classification.ipynb b/notebooks/01_classification.ipynb index 2b40809..4ded6aa 100644 --- a/notebooks/01_classification.ipynb +++ b/notebooks/01_classification.ipynb @@ -12,6 +12,7 @@ " name: \"01_classification\"\n", " language: \"python\"\n", " display_name: \"01_classification\"\n", + "eval: false\n", "---" ] }, @@ -19,22 +20,134 @@ "cell_type": "code", "metadata": {}, "source": [ - "#| echo: true\n", - "#| code-fold: true\n", - "# Imports\n", - "import os\n", + "from datetime import datetime, timedelta\n", + "\n", "import xarray as xr\n", - "import numpy as np\n", - "import datetime as dt\n", + "import pystac_client\n", + "import stackstac\n", + "import odc.stac\n", + "from odc.geo.geobox import GeoBox\n", + "from dask.diagnostics import ProgressBar\n", + "from rasterio.enums import Resampling\n", + "\n", "import cmcrameri as cmc\n", - "import folium\n", - "from dotenv import dotenv_values\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "from pathlib import Path\n", + "import seaborn as sns\n", + "import pandas as pd\n", + "\n", + "# Python Script\n", + "from functions import preprocess_data_to_classify\n", + "\n", + "# Scikit Learn\n", + "from sklearn.naive_bayes import GaussianNB\n", + "from sklearn.metrics import confusion_matrix\n", + "from sklearn.metrics import classification_report\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "import matplotlib.colors as colors" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this Notebook, we will use ``Scikit learn`` to classify an ``Sentinel-2`` Image. We will use two different classifiers and compare the results. \n", + "\n", + "## Data Acquisition\n", + "Before we start, we need to load the data. We will use ``ODC Stac`` to get data from Earth Search by Element 84. Here we define the area of interest and the time frame, aswell as the EPSG code and the resolution.\n" + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "dx = 0.0006 # 60m resolution\n", + "epsg = 4326\n", + "\n", + "# Spatial extent\n", + "latmin, latmax = 47.86, 48.407\n", + "lonmin, lonmax = 16.32, 16.9\n", + "bounds = (lonmin, latmin, lonmax, latmax)\n", + "minx, miny, maxx, maxy = bounds\n", + "geom = {\n", + " 'type': 'Polygon',\n", + " 'coordinates': [[\n", + " [minx, miny],\n", + " [minx, maxy],\n", + " [maxx, maxy],\n", + " [maxx, miny],\n", + " [minx, miny]\n", + " ]]\n", + "}\n", + "\n", + "# Temporal extent\n", + "year = 2024\n", + "month = 5\n", + "day = 1\n", + "delta = 10\n", "\n", - "from eodag import EODataAccessGateway, SearchResult, setup_logging\n", - "from rasterio.crs import CRS\n", + "start_date = datetime(year, month, day)\n", + "end_date = start_date + timedelta(days=delta)\n", + "date_query = start_date.strftime(\"%Y-%m-%d\") + \"/\" + end_date.strftime(\"%Y-%m-%d\")\n", "\n", - "# Setup logging\n", - "setup_logging(0)" + "# Search for Sentinel-2 data\n", + "items = pystac_client.Client.open(\n", + " \"https://earth-search.aws.element84.com/v1\"\n", + ").search(\n", + " intersects=geom,\n", + " collections=[\"sentinel-2-l2a\"],\n", + " datetime=date_query,\n", + " limit=100,\n", + ").item_collection()\n", + "\n", + "print(len(items), \"scenes found\")" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We have now found a couple of scenes that we can use for our analysis. Now we will load the data directly into an ``xarray`` dataset." + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# define a geobox for my region\n", + "geobox = GeoBox.from_bbox(bounds, crs=f\"epsg:{epsg}\", resolution=dx)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# lazily combine items\n", + "ds_odc = odc.stac.load(\n", + " items,\n", + " bands=[\"scl\", \"red\", \"green\", \"blue\", \"nir\"],\n", + " chunks={'time': 5, 'x': 600, 'y': 600},\n", + " geobox=geobox,\n", + " resampling=\"bilinear\",\n", + ")" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# actually load it\n", + "with ProgressBar():\n", + " ds_odc.load()" ], "execution_count": null, "outputs": [] @@ -43,8 +156,304 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Lorum ipsum" + "Next we define a mask to filter out the clouds and the areas with no data and can plot a composite median image of the data." ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# define a mask for valid pixels (non-cloud)\n", + "\n", + "def is_valid_pixel(data):\n", + " # include only vegetated, not_vegitated, water, and snow\n", + " return ((data > 3) & (data < 7)) | (data==11)\n", + "\n", + "ds_odc['valid'] = is_valid_pixel(ds_odc.scl)\n", + "ds_odc.valid.sum(\"time\").plot()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "def avg(ds):\n", + " return (ds / ds.max() * 2)\n", + "\n", + "# compute the masked median\n", + "rgb_median = (\n", + " ds_odc[['red', 'green', 'blue']]\n", + " .where(ds_odc.valid)\n", + " .to_dataarray(dim=\"band\")\n", + " .transpose(..., \"band\")\n", + " .median(dim=\"time\")\n", + " .astype(int)\n", + ")\n", + "rgb_comp = avg(rgb_median)\n", + "plot = rgb_comp.plot.imshow(rgb=\"band\", figsize=(8, 8))\n", + "plot.axes.set_title(f\"RGB - Median Composite\\n{start_date.strftime('%d.%m.%Y')} - {end_date.strftime('%d.%m.%Y')}\")\n", + "plt.show()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To get an first impression of the data, we can calculate the NDVI (Normalized Difference Vegetation Index) and plot it. This gives us a good overview of the vegetation in the area." + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Normalized Difference Vegetation Index (NDVI)\n", + "def normalized_difference(a, b):\n", + " return (a - b*1.) / (a + b)\n", + "\n", + "ndvi = normalized_difference(ds_odc.nir, ds_odc.red)\n", + "ndvi.median(dim=\"time\").plot.imshow(cmap='cmc.cork').axes.set_title('NDVI')\n", + "plt.show()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Classification \n", + "\n", + "### Data Preparation\n", + "In order to start with the actual classification, we need to prepare some data. Since this is a supervised classification, we need to have some training data. We have two ``Geojson`` Files which contain areas that are either forest or not forest. We will use these to train our classifiers. Additionally from the loaded Dataset we will extract the bands that we want to use for the classification.\n" + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "forest_path = Path(\"shapefiles/forest.geojson\")\n", + "nonforest_path = Path(\"shapefiles/nonforest.geojson\")" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "ds_class = (\n", + " ds_odc[['red', 'green', 'blue', 'nir']]\n", + " .where(ds_odc.valid)\n", + " .median(dim=\"time\")\n", + ")\n", + "ds_cf = avg(ds_class)\n", + "\n", + "single_img = ds_cf.fillna(0)\n", + "X_train, X_test, y_train, y_test = preprocess_data_to_classify(ds=single_img, feature_path=forest_path, nonfeature_path=nonforest_path)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Naive Bayes\n", + "nb = GaussianNB()\n", + "nb_test = nb.fit(X_train, y_train)\n", + "nb_predict = nb.predict(X_test)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "print(\"NAIVE BAYES: \\n \"+ classification_report(y_test, nb_predict))" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "print(\"NAIVE BAYES: \\n \",confusion_matrix(y_test, nb_predict), \"\\n\")" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Creating \n", + "bands = []\n", + "for band in [\"red\", \"green\", \"blue\", \"nir\"]:\n", + " bands.append(single_img[band].values)\n", + " \n", + "image_data = np.stack(bands, axis=2)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Reshape the image data\n", + "num_of_pixels = single_img.sizes['longitude'] * single_img.sizes['latitude']\n", + "num_of_bands = len(bands)\n", + "X_image_data = image_data.reshape(num_of_pixels, num_of_bands)\n", + "\n", + "# Predict and reshape the image\n", + "nb_predict_img = nb.predict(X_image_data)\n", + "nb_predict_img = nb_predict_img.reshape(single_img.sizes['latitude'], single_img.sizes['longitude'])" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "single_img['NB-forest'] = xr.DataArray(nb_predict_img, dims=['latitude', 'longitude'], coords={'longitude': single_img['longitude'], 'latitude': single_img['latitude']})" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "alpha = 1\t\n", + "cmap_green = colors.ListedColormap([(1, 1, 1, alpha), 'green'])\n", + "\n", + "plot = single_img['NB-forest'].plot.imshow(cmap=cmap_green, cbar_kwargs={'ticks': [0.25,0.75]})\n", + "cbar = plot.colorbar\n", + "cbar.set_ticklabels(['non-forest', 'forest'])\n", + "plot.axes.set_title('Naive Bayes Classification')\n", + "plt.show()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "rf = RandomForestClassifier(n_estimators=100)\n", + "rf_test = rf.fit(X_train, y_train)\n", + "rf_predict = rf.predict(X_test)\n", + "\n", + "rf_predict_img = rf.predict(X_image_data)\n", + "rf_predict_img = rf_predict_img.reshape(single_img.sizes['latitude'], single_img.sizes['longitude'])\n", + "\n", + "single_img['RF-forest'] = xr.DataArray(rf_predict_img, dims=['latitude', 'longitude'], coords={'longitude': single_img['longitude'], 'latitude': single_img['latitude']})\n", + "\n", + "plot = single_img['RF-forest'].plot.imshow(cmap=cmap_green, cbar_kwargs={'ticks': [0.25,0.75]})\n", + "cbar = plot.colorbar\n", + "cbar.set_ticklabels(['non-forest', 'forest'])\n", + "plot.axes.set_title('Random Forest Classification')\n", + "plt.show()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "print(\"RANDOM FOREST: \\n \"+ classification_report(y_test, rf_predict))" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "print(\"RANDOM FOREST: \\n \",confusion_matrix(y_test, rf_predict), \"\\n\")" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# cmap_trio = colors.ListedColormap(['gainsboro' ,'limegreen', 'mediumseagreen', 'darkgreen'])\n", + "cmap_trio = colors.ListedColormap(['whitesmoke' ,'indianred', 'goldenrod', 'darkgreen'])\n", + "\n", + "\n", + "double_clf = (single_img['NB-forest'] + 2*single_img['RF-forest'])\n", + "\n", + "fig, ax = plt.subplots()\n", + "cax = ax.imshow(double_clf, cmap=cmap_trio, interpolation='none')\n", + "\n", + "# Add a colorbar with custom tick labels\n", + "cbar = fig.colorbar(cax, ticks=[1*0.375, 3*0.375, 5*0.375, 7*0.375])\n", + "cbar.ax.set_yticklabels(['None', 'Naive Bayes', 'Random Forest', 'Both'])\n", + "ax.set_title('Classification Comparisson')\n", + "ax.set_axis_off()\n", + "plt.show()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Plot only one class, either None (0), Naive Bayes (1), Random Forest (2), or Both (3)\n", + "fig, axs = plt.subplots(2,2, figsize=(8,8))\n", + "ax = axs.ravel()\n", + "\n", + "for i in range(4):\n", + " ax[i].imshow(double_clf==i, cmap='cmc.oleron_r', interpolation='none')\n", + " category = ['by None', 'only by Naive Bayes', 'only by Random Forest', 'by Both'][i]\n", + " title = 'Areas classified ' + category\n", + " ax[i].set_title(title)\n", + " ax[i].set_axis_off()\n", + "\n", + "plt.tight_layout()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "counts = {}\n", + "for num in range(1,4): # Change the start of the range to 0 if you want to include the 'None' class\n", + " num_2_class = {0: 'None', 1: 'Naive Bayes', 2: 'Random Forest', 3: 'Both'}\n", + " counts[num_2_class[num]] = int((double_clf == num).sum().values)\n", + "\n", + "class_counts_df = pd.DataFrame(list(counts.items()), columns=['Class', 'Count'])" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "class_counts_df['Percentage'] = (class_counts_df['Count'] / class_counts_df['Count'].sum())*100\n", + "ax = class_counts_df.plot.bar(x='Class', y='Percentage', rot=0, color='darkgreen', ylim=(0,100), title='Classified Areas per Classificator (%)')\n", + "\n", + "# Annotate the bars with the percentage values\n", + "for p in ax.patches:\n", + " ax.annotate(f'{p.get_height():.1f}%', (p.get_x() + p.get_width() / 2., p.get_height()), \n", + " ha='center', va='center', xytext=(0, 9), textcoords='offset points')" + ], + "execution_count": null, + "outputs": [] } ], "metadata": { diff --git a/notebooks/01_classification.yml b/notebooks/01_classification.yml index 92fc6fe..53134d5 100644 --- a/notebooks/01_classification.yml +++ b/notebooks/01_classification.yml @@ -1,15 +1,21 @@ name: 01_classification channels: - conda-forge + - defaults dependencies: - python=3.10 - pip - mamba - jupyter - xarray - - cmcrameri - - python-dotenv + - stackstac + - odc-stac + - dask - rasterio - - folium + - cmcrameri + - numpy + - matplotlib + - seaborn + - scikit-learn - pip: - - eodag \ No newline at end of file + - pystac-client diff --git a/notebooks/02_floodmapping.yml b/notebooks/02_floodmapping.yml index a06c192..c0aa697 100644 --- a/notebooks/02_floodmapping.yml +++ b/notebooks/02_floodmapping.yml @@ -12,7 +12,6 @@ dependencies: - xarray - zarr - netcdf4 - - ipykernel - ipympl - ipywidgets - datashader \ No newline at end of file