On this page

Create chips from a temporal set of Sentinel 2 snapshotsΒΆ

  • create chip bounds augmented by sliding

  • sample S2

    • loop over tiles

      • get bounds by sjoin

      • loop over times

        • loop over bounds (bag)

          • check cloud probability/cloud mask

            • sample a chip

            • check for missing values

            • save a chip

# this cell does not work in the gds_env:7.0 and requires newer versions of GDAL and pyogrio. Works with GDAL 3.4.1, pyogrio 0.3.0.
# import pyogrio

# tiles_of_interest = [
#     '29UQR', '30UUA', '29UQS', '30UWA', '30UVA', '30UXB', '30UWB',
#     '30UVB', '30UXC', '30UWC', '30UVC', '30UXD', '30UWD', '30UUB',
#     '29UQT', '30UUC', '29UQU', '30UUD', '30UVD', '30UXE', '30UWE',
#     '30UVE', '30UXF', '30UWF', '30UVF', '30UXG', '30UWG', '30UVG',
#     '29UQV', '30UUE', '30UUF', '30UUG', '30VWH', '30VVH', '29VPC',
#     '30VUH', '30UYB', '31UCS', '30UYC', '31UCT', '31UDT', '30UYD',
#     '31UCU', '31UDU', '30UYE', '31UCV'
# ]

# tile_geometry = pyogrio.read_dataframe("https://sentinels.copernicus.eu/documents/247904/1955685/S2A_OPER_GIP_TILPAR_MPC__20151209T095117_V20150622T000000_21000101T000000_B00.kml")
# tile_geometry = tile_geometry[tile_geometry.Name.isin(tiles_of_interest)].explode(index_parts=False)
# tile_geometry[tile_geometry.geom_type == "Polygon"].to_file("../chips_gb/sentinel_tiles.gpkg")
import os
import pyogrio
import geopandas
import glob
import math
import rasterio
import rioxarray
import matplotlib.pyplot as plt
import pandas
from pathlib import Path
from dask.distributed import Client, LocalCluster, as_completed
client = Client(LocalCluster(n_workers=16))
client

Client

Client-42df0a96-90dd-11ec-b8ad-33d6c79f778e

Connection method: Cluster object Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status

Cluster Info

specs = {
    'chip_size': 32,
    'folder': (
        '/home/jovyan/work/chips_gb/32_temporal/'
    ),
}
s = specs["chip_size"]
tiles = pyogrio.read_dataframe("../chips_gb/sentinel_tiles.gpkg").to_crs(27700)
def process_tile(row):
    epsg = pandas.read_html(row.description)[0].loc[1, 1]
    bounds_in = bounds.iloc[bounds.sindex.query(row.geometry, predicate="contains")].to_crs(int(epsg))
    times = glob.glob(f"../../data/Sentinel2/{row.Name}/*")
    for t in times:
        try:
            cloud_proba = glob.glob(f"{t}/*CLDPRB*")[0]
            tci = glob.glob(f"{t}/*TCI*")[0]
            for tup in bounds_in.itertuples():
                with rasterio.open(cloud_proba) as f:
                    cldprb, transform = rasterio.mask.mask(
                                f, [tup.geometry], crop=True, all_touched=True
                            )
                if (cldprb > 10).sum() < 10:
                    with rasterio.open(tci) as src:
                        profile = src.profile
                        profile.update(
                            width=s,
                            height=s,
                            driver="GTiff",
                            dtype=rasterio.uint8,
                            tiled=False
                        )
                        try:
                            img, transform = rasterio.mask.mask(
                                src, [tup.geometry], crop=True, all_touched=True
                            )
                            _, w, h = img.shape
                            rw = (w - s) / 2
                            rh = (h - s) / 2
                            img = img[:, math.floor(rw):-math.ceil(rw), math.floor(rh):-math.ceil(rh)]
                            if ((img[0] == img[1]) & (img[1] == img[2])).sum() < 3:  # filter missingness

                                path = f"{specs['folder']}{tup.signature_type}/{tup.X}_{tup.Y}_{Path(t).stem}.tif"
                                with rasterio.open(path, 'w', **profile) as dst:
                                    dst.write(img.astype(rasterio.uint8))
                        except ValueError:
                            pass
        except IndexError:
            pass
    return f"Tile {row.Name} processed sucessfully." 
subsets = ["train", "validation", "secret"]
subsets = ["secret"]

for sub in subsets:
    specs['folder'] = f'/home/jovyan/work/chips_gb/32_temporal/{sub}/'
    
    bounds = geopandas.read_parquet(f"../chips_gb/slided_{sub}_50k.pq")
    centroid = bounds.centroid
    bounds['X'] = centroid.x.astype(int)
    bounds['Y'] = centroid.y.astype(int)
    
    for t in bounds.signature_type.unique():
        os.makedirs(f"{specs['folder']}{t}", exist_ok=True)
    
    inputs = tiles.itertuples()
    futures = [client.submit(process_tile, next(inputs)) for i in range(16)]
    ac = as_completed(futures)
    for finished_future in ac:
        # submit new future 
        try:
            new_future = client.submit(process_tile, next(inputs))
            ac.add(new_future)
        except StopIteration:
            pass
        print(finished_future.result())
    
Tile 29VPC processed sucessfully.
Tile 29UQU processed sucessfully.
Tile 29UQV processed sucessfully.
Tile 30UUE processed sucessfully.
Tile 30UUD processed sucessfully.
Tile 30UWA processed sucessfully.
Tile 29UQS processed sucessfully.
Tile 30UUF processed sucessfully.
Tile 29UQR processed sucessfully.
Tile 30UUB processed sucessfully.
Tile 30UUA processed sucessfully.
Tile 29UQT processed sucessfully.
Tile 30UUC processed sucessfully.
Tile 30UVA processed sucessfully.
Tile 30UVD processed sucessfully.
Tile 30UUG processed sucessfully.
Tile 30UVF processed sucessfully.
Tile 30UXG processed sucessfully.
Tile 30UVE processed sucessfully.
Tile 30UVB processed sucessfully.
Tile 30UWB processed sucessfully.
Tile 30UYE processed sucessfully.
Tile 30UWC processed sucessfully.
Tile 30VUH processed sucessfully.
Tile 30VWH processed sucessfully.
Tile 30UYB processed sucessfully.
Tile 30UXF processed sucessfully.
Tile 30UYD processed sucessfully.
Tile 31UCV processed sucessfully.
Tile 31UDT processed sucessfully.
Tile 30VVH processed sucessfully.
Tile 31UDU processed sucessfully.
Tile 30UVG processed sucessfully.
Tile 31UCS processed sucessfully.
Tile 30UWG processed sucessfully.
Tile 31UCU processed sucessfully.
Tile 30UXE processed sucessfully.
Tile 30UVC processed sucessfully.
Tile 30UWF processed sucessfully.
Tile 30UXB processed sucessfully.
client.restart()
distributed.nanny - WARNING - Worker process still alive after 1.5999980926513673 seconds, killing

Client

Client-42df0a96-90dd-11ec-b8ad-33d6c79f778e

Connection method: Cluster object Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status

Cluster Info