In [1]:
# Import packages
import requests
import pandas as pd
import io
import boto3
from urllib.parse import urlparse
from sagemaker_studio import Project

proj = Project()

In [2]:
url = "https://aws-blogs-artifacts-public.s3.us-east-1.amazonaws.com/artifacts/BDB-5066/synthetic_nyc_weather_hourly_2016.csv"
response = requests.get(url)
data = response.text

w_df = pd.read_csv(io.StringIO(data.strip()))


print('NYC Weather Data Sample \n\n')
w_df.head()

NYC Weather Data Sample 




Unnamed: 0,"<?xml version=""1.0"" encoding=""UTF-8""?>"
0,<Error><Code>AccessDenied</Code><Message>Acces...


In [10]:
# rename columns
w_df = w_df.rename(columns={
    w_df.columns[0]: 'timestamp',
    w_df.columns[3]: 'temp_c',
    w_df.columns[5]: 'rain_mm',
    w_df.columns[10]: 'windspeed'
})

# convert rain mm to rain inches
w_df['rain_inches'] = w_df['rain_mm'].apply(lambda x: x * 25.4)
w_df['temp_f'] = w_df['temp_c'].apply(lambda x: (x * 9/5) + 32)
w_df['timestamp'] = pd.to_datetime(w_df['timestamp'])
w_df['rounded_hour'] = w_df['timestamp'].dt.round('h')

# Keep only desired columns
w_df = w_df[['timestamp', 'temp_f', 'rain_inches', 'windspeed', 'rounded_hour']]


print('NYC Weather Data Sample \n\n')
w_df.head()

NYC Weather Data Sample 




Unnamed: 0,timestamp,temp_f,rain_inches,windspeed,rounded_hour
0,2016-01-01 00:00:00,28.274,0.0,15.4,2016-01-01 00:00:00
1,2016-01-01 01:00:00,27.842,0.0,14.87,2016-01-01 01:00:00
2,2016-01-01 02:00:00,27.428,0.0,14.35,2016-01-01 02:00:00
3,2016-01-01 03:00:00,26.798,0.0,13.82,2016-01-01 03:00:00
4,2016-01-01 04:00:00,26.384,0.0,13.29,2016-01-01 04:00:00


In [11]:
# Path to where data will be stored
file_name = 'weather_data.csv'
s3_folder = '/datalake/'
data_path = proj.s3.root + s3_folder + file_name


# Create S3 client
s3_client = boto3.client('s3')


# Function to write the results to S3
def put_dataframe_to_s3_as_csv(s3_uri, dataframe):
    # Convert DataFrame to CSV string
    csv_buffer = io.StringIO()
    dataframe.to_csv(csv_buffer, index=False)

    # Parse the S3 URI
    parsed = urlparse(s3_uri)
    bucket = parsed.netloc
    key = parsed.path.lstrip('/')

    # Upload to S3
    s3_client.put_object(
        Bucket=bucket,
        Key=key,
        Body=csv_buffer.getvalue(),
        ContentType='text/csv'
    )

In [12]:
# Write result to S3
try:
    put_dataframe_to_s3_as_csv(data_path, w_df)
    print("DataFrame successfully uploaded as CSV to S3")
except Exception as e:
    print(f"Error uploading DataFrame to S3: {str(e)}")

DataFrame successfully uploaded as CSV to S3
