Code
import pandas as pd
import numpy as np
def clean_sales(raw):
    """Return a cleaned copy of the supermarket sales table.

    The input frame is not modified. The steps, in order:

    1. Parse ``Date`` to datetime; values that cannot be parsed become NaT.
    2. Title-case the ``Product line`` labels.
    3. Drop rows where ``Unit price`` or ``Quantity`` is missing.
    4. Keep only rows with a strictly positive ``Quantity``.
    5. Add ``Day of Week`` and ``Month`` name columns derived from ``Date``.

    Args:
        raw: DataFrame containing at least the columns ``Date``,
            ``Product line``, ``Unit price`` and ``Quantity``.

    Returns:
        A new DataFrame with the surviving rows (original index labels kept)
        and the two extra columns.
    """
    # Work on a copy so the caller's frame is left untouched.
    df = raw.copy()

    # Standardize formats
    df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
    df['Product line'] = df['Product line'].str.title()

    # Handle missing values
    df = df.dropna(subset=['Unit price', 'Quantity'])

    # Fix errors: remove zero or negative quantities.
    # The explicit copy lets the column assignments below write to an
    # independent frame instead of a slice of the filtered one.
    df = df[df['Quantity'] > 0].copy()

    # New columns (rows whose Date is NaT get a missing value here)
    df['Day of Week'] = df['Date'].dt.day_name()
    df['Month'] = df['Date'].dt.month_name()

    return df


if __name__ == "__main__":
    # Load data from the data/ folder
    df = pd.read_csv("../data/supermarket_sales.csv")
    df = clean_sales(df)

    # Save the cleaned data (optional)
    # df.to_csv("../data/supermarket_sales_clean.csv", index=False)

    # Initial summary
    print(df.head())
Invoice ID Branch City Customer type Gender \
0 750-67-8428 A Yangon Member Female
1 226-31-3081 C Naypyitaw Normal Female
2 631-41-3108 A Yangon Normal Male
3 123-19-1176 A Yangon Member Male
4 373-73-7910 A Yangon Normal Male
Product line Unit price Quantity Tax 5% Total Date \
0 Health And Beauty 74.69 7 26.1415 548.9715 2019-01-05
1 Electronic Accessories 15.28 5 3.8200 80.2200 2019-03-08
2 Home And Lifestyle 46.33 7 16.2155 340.5255 2019-03-03
3 Health And Beauty 58.22 8 23.2880 489.0480 2019-01-27
4 Sports And Travel 86.31 7 30.2085 634.3785 2019-02-08
Time Payment Cost of goods sold Gross margin percentage \
0 13:08 Ewallet 522.83 4.761905
1 10:29 Cash 76.40 4.761905
2 13:23 Credit card 324.31 4.761905
3 20:33 Ewallet 465.76 4.761905
4 10:37 Ewallet 604.17 4.761905
Gross income Customer stratification rating Day of Week Month
0 26.1415 9.1 Saturday January
1 3.8200 9.6 Friday March
2 16.2155 7.4 Sunday March
3 23.2880 8.4 Sunday January
4 30.2085 5.3 Friday February