-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #135 from IFRCGo/feature/global_flood_database
GFD Extraction Transformation.
- Loading branch information
Showing
18 changed files
with
530 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
from datetime import datetime, timedelta | ||
|
||
from celery import chain, shared_task | ||
|
||
from apps.etl.extraction.sources.gfd.extract import GFDExtraction | ||
from apps.etl.transform.sources.gfd import GFDTransformHandler | ||
|
||
|
||
@shared_task
def ext_and_transform_gfd_historical_data():
    """Run the full GFD pipeline: extract the complete archive, then transform it.

    The extraction task's return value (the extraction id) is piped into the
    transform task via a Celery chain.
    """
    workflow = chain(
        GFDExtraction.task.s(),
        GFDTransformHandler.task.s(),
    )
    workflow.apply_async()
|
||
|
||
@shared_task
def ext_and_transform_gfd_latest_data():
    """Run the GFD pipeline for the most recent one-day window.

    Extracts events between yesterday and today, then transforms them.
    NOTE(review): uses the server's naive local date — confirm whether the
    GFD API expects UTC dates.
    """
    today = datetime.now().date()
    yesterday = today - timedelta(days=1)

    chain(
        GFDExtraction.task.s(yesterday, today),
        GFDTransformHandler.task.s(),
    ).apply_async()
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,180 @@ | ||
import base64 | ||
import hashlib | ||
import json | ||
import logging | ||
import tempfile | ||
from typing import Any, Callable | ||
|
||
import ee | ||
import requests | ||
from django.conf import settings | ||
|
||
from apps.etl.extraction.sources.base.handler import BaseExtraction | ||
from apps.etl.extraction.sources.base.utils import manage_duplicate_file_content | ||
from apps.etl.models import ExtractionData | ||
from main.celery import app | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
DATA_URL = "https://earthengine.googleapis.com/v1alpha/projects/earthengine-legacy/assets/GLOBAL_FLOOD_DB/MODIS_EVENTS/V1" | ||
|
||
|
||
class GFDExtraction(BaseExtraction):
    """Extract flood-event metadata from the Global Flood Database (GFD)
    MODIS events collection via the Google Earth Engine API, and persist
    the result as an ``ExtractionData`` row.
    """

    @classmethod
    def decode_json(cls, encoded_str):
        """Decode a URL-safe Base64 string back to a JSON object."""
        decoded_data = base64.urlsafe_b64decode(encoded_str.encode()).decode()
        return json.loads(decoded_data)

    @classmethod
    def get_json_credentials(cls, content):
        """Write *content* as canonical JSON to a temp file and return its path.

        NOTE(review): the file is created with ``delete=False`` and is never
        removed, so the service-account credential lingers on disk. It is
        kept as-is here because Earth Engine may read the file lazily —
        TODO: delete it once ``ee.Initialize()`` has succeeded.
        """
        with tempfile.NamedTemporaryFile(delete=False, mode="w") as temp_file:
            temp_file.write(json.dumps(content, sort_keys=True))
            return temp_file.name

    @classmethod
    def hash_json_content(cls, json_data):
        """Return the SHA256 hex digest of *json_data* serialized as JSON.

        Callers in this module pass an already-serialized JSON string, which
        is re-serialized here (hashed with surrounding quotes); that is
        consistent as long as every caller does the same.
        """
        json_string = json.dumps(json_data, sort_keys=True)
        return hashlib.sha256(json_string.encode()).hexdigest()

    @classmethod
    def store_extraction_data(
        cls,
        validate_source_func: Callable[[Any], None],
        source: int,
        response: dict,
        instance_id: int = None,
    ):
        """
        Save extracted data into the database, de-duplicating by content hash.

        Args:
            validate_source_func: optional validator returning a dict with
                ``status`` and ``validation_error`` keys; ``None`` skips validation.
            source: source identifier used to build the stored file name.
            response: extracted payload to serialize and persist.
            instance_id: id of the pre-created ExtractionData row to update.
        Returns:
            The updated ExtractionData instance.
        """
        file_extension = "json"
        file_name = f"{source}.{file_extension}"
        resp_data_content = json.dumps(response)

        # Record the payload content type before anything else is written.
        extraction_instance = ExtractionData.objects.get(id=instance_id)
        extraction_instance.resp_data_type = "application/json"
        extraction_instance.save(update_fields=["resp_data_type"])

        # Only persist non-empty payloads.
        if resp_data_content:
            if validate_source_func:
                # Validate once and reuse the result — the original code
                # invoked the validator twice on the same payload.
                validation = validate_source_func(resp_data_content)
                extraction_instance.source_validation_status = validation["status"]
                extraction_instance.content_validation = validation["validation_error"]

            # De-duplicate by content hash before writing the file.
            hash_content = cls.hash_json_content(resp_data_content)
            manage_duplicate_file_content(
                source=extraction_instance.source,
                hash_content=hash_content,
                instance=extraction_instance,
                response_data=resp_data_content,
                file_name=file_name,
            )
        return extraction_instance

    @classmethod
    def _save_response_data(cls, instance: ExtractionData, response: requests.Response) -> dict:
        """
        Persist the extracted payload against *instance* and return it unchanged.

        Args:
            instance: ExtractionData row the payload belongs to.
            response: extracted payload (despite the annotation, callers pass
                the list returned by ``extract_data`` — TODO confirm and fix
                the annotation upstream).
        Returns:
            The payload, untouched, so the caller can test it for emptiness.
        """
        # The stored instance is re-fetched by id inside store_extraction_data;
        # its return value is not needed here.
        cls.store_extraction_data(
            response=response,
            source=instance.source,
            validate_source_func=None,
            instance_id=instance.id,
        )
        return response

    @classmethod
    def get_flood_data(cls, collection, batch_size=1000):
        """Retrieve flood metadata in batches to avoid memory issues."""
        total_size = collection.size().getInfo()

        all_data = []
        for offset in range(0, total_size, batch_size):
            # toList(count, offset) pages through the server-side collection.
            all_data.extend(collection.toList(batch_size, offset).getInfo())

        return all_data

    @classmethod
    def handle_extraction(cls, url: str, source: int, start_date, end_date) -> int:
        """
        Run one extraction: create the tracking row, fetch, persist, set status.

        Returns:
            int: ID of the extraction instance.
        Raises:
            requests.exceptions.RequestException: re-raised after marking the
                extraction FAILED.
        """
        logger.info("Starting data extraction")
        instance = cls._create_extraction_instance(url=url, source=source)

        try:
            cls._update_instance_status(instance, ExtractionData.Status.IN_PROGRESS)
            response = cls.extract_data(start_date, end_date)
            response_data = cls._save_response_data(instance, response)
            if response_data:
                cls._update_instance_status(instance, ExtractionData.Status.SUCCESS)
                logger.info("Data extracted successfully")
            else:
                # Successful call but empty payload: record NO_DATA validation.
                cls._update_instance_status(
                    instance,
                    ExtractionData.Status.SUCCESS,
                    ExtractionData.ValidationStatus.NO_DATA,
                    update_validation=True,
                )
                logger.warning("No hazard data found in response")

            return instance.id

        except requests.exceptions.RequestException:
            cls._update_instance_status(instance, ExtractionData.Status.FAILED)
            logger.error(
                "extraction failed",
                exc_info=True,
                extra={
                    "source": instance.source,
                },
            )
            raise

    @classmethod
    def extract_data(cls, start_date=None, end_date=None):
        """
        Authenticate against Earth Engine and pull GFD flood events.

        Args:
            start_date, end_date: optional date bounds; the date filter is
                applied only when BOTH are provided (a single bound is
                silently ignored).
        Returns:
            list: raw feature dicts from the GFD image collection.
        """
        service_account = settings.GFD_SERVICE_ACCOUNT

        # Decode the Base64-encoded Earth Engine credential and write it to
        # a temp file, since ee.ServiceAccountCredentials wants a file path.
        decoded_json = cls.decode_json(settings.GFD_CREDENTIAL)
        credential_file_path = cls.get_json_credentials(decoded_json)

        credentials = ee.ServiceAccountCredentials(service_account, credential_file_path)
        ee.Initialize(credentials)

        # Load the Global Flood Database (GFD) collection.
        gfd_data = ee.ImageCollection("GLOBAL_FLOOD_DB/MODIS_EVENTS/V1")

        if start_date and end_date:
            gfd_data = gfd_data.filterDate(str(start_date), str(end_date))

        return cls.get_flood_data(gfd_data, batch_size=500)

    @staticmethod
    @app.task
    def task(start_date=None, end_date=None):
        """Celery entry point; returns the extraction instance id."""
        # NOTE(review): Source.GIDD looks wrong for a GFD extractor — confirm
        # whether ExtractionData.Source defines a GFD member and use it.
        return GFDExtraction().handle_extraction(DATA_URL, ExtractionData.Source.GIDD, start_date, end_date)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
from django.core.management.base import BaseCommand | ||
|
||
from apps.etl.etl_tasks.gfd import ext_and_transform_gfd_historical_data | ||
|
||
|
||
class Command(BaseCommand):
    """Management command that runs the full historical GFD ETL pipeline."""

    help = "Import data from gfd api"

    def handle(self, *args, **options):
        # Invoked directly (no .delay()), so the task body runs synchronously
        # in this process rather than on a Celery worker.
        ext_and_transform_gfd_historical_data()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
from pystac_monty.sources.gfd import GFDDataSource, GFDTransformer | ||
|
||
from apps.etl.models import ExtractionData | ||
from apps.etl.transform.sources.handler import BaseTransformerHandler | ||
from main.celery import app | ||
|
||
|
||
class GFDTransformHandler(BaseTransformerHandler):
    """Transform handler that feeds stored GFD extraction output into the
    pystac-monty GFD transformer.
    """

    transformer = GFDTransformer
    transformer_schema = GFDDataSource

    @classmethod
    def get_schema_data(cls, extraction_obj: ExtractionData):
        """Read the stored extraction file and wrap it in the schema type."""
        with extraction_obj.resp_data.open() as stored_file:
            raw_content = stored_file.read()

        return cls.transformer_schema(source_url=extraction_obj.url, data=raw_content)

    @staticmethod
    @app.task
    def task(extraction_id):
        """Celery entry point: transform the extraction identified by id."""
        return GFDTransformHandler().handle_transformation(extraction_id)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Submodule pystac-monty
updated
3 files
+204 −0 | pystac_monty/sources/gfd.py | |
+874 −0 | tests/extensions/cassettes/test_gfd/GFDTest.test_transformer_0.yaml | |
+156 −0 | tests/extensions/test_gfd.py |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -26,6 +26,7 @@ dependencies = [ | |
"sentry-sdk", | ||
"ipython", | ||
"uwsgi", | ||
"earthengine-api>=1.5.1", | ||
] | ||
|
||
[tool.uv.sources] | ||
|
Oops, something went wrong.