In [1]:
import pandas as pd
import csv
import numpy as np
In [2]:
dfRecipes = pd.read_csv('csv/fit.recepty.csv')
dfProducts = pd.read_csv('csv/products.csv')
dfCategories = pd.read_csv('csv/out_categories.csv')
In [3]:
dfProducts.rename(columns={'primary_category.id': 'cat_id'}, inplace=True)
dfCategories.rename(columns={'category_id' : 'cat_id'}, inplace=True)
In [4]:
#tu sa pozrieme na datove typy s akymi pracujeme
dfProducts.dtypes
Out[4]:
badge                         object
brand_name                    object
categories                    object
country_of_origin             object
crc32                          int64
description                   object
id                             int64
image                         object
image_large                   object
ingredients                   object
name                          object
new_name                      object
popularity                   float64
price_amount                  object
price_currency                object
price_for_bottle.amount      float64
price_for_bottle.currency     object
price_for_unit_amount         object
price_for_unit_currency       object
cat_id                       float64
unit                          object
weight                        object
weight_variations               bool
dtype: object
In [5]:
dfCategories.dtypes
Out[5]:
cat_id            int64
category_path    object
dtype: object
In [6]:
#vypis tabulky, kde vidime ze vo vstupnych datach je price_amount s , a nie . preto to nejde pretypovat
dfProducts.head()
Out[6]:
badge brand_name categories country_of_origin crc32 description id image image_large ingredients ... price_amount price_currency price_for_bottle.amount price_for_bottle.currency price_for_unit_amount price_for_unit_currency cat_id unit weight weight_variations
0 NaN Nivea [{'id': 300109000}, {'id': 300109103}, {'id': ... dle aktuální nabídky 4119786657 Nivea Sprej antiperspirant Silver Protect Dyna... 1296725 https://1564005101.rsc.cdn77.org/images/grocer... https://1564005101.rsc.cdn77.org/images/grocer... Butane\nIosbutane\nPropane\nCyclomethicone\nAl... ... 64,9 CZK NaN NaN 432,67 CZK 300109110.0 l NaN False
1 NaN Nivea [{'id': 300109000}, {'id': 300109103}, {'id': ... dle aktuální nabídky 3851673989 Předchází vzniku bílých stop na černém oblečen... 1296731 https://1564005101.rsc.cdn77.org/images/grocer... https://1564005101.rsc.cdn77.org/images/grocer... Stearyl alcohol\nPPG-14 butyl ether\nCyclometh... ... 64,9 CZK NaN NaN 1622,5 CZK NaN kg NaN False
2 NaN Nivea [{'id': 300109000}, {'id': 300109103}, {'id': ... dle aktuální nabídky 3818404418 48 h ochrana\nBez alkoholu a barviv\nDermatolo... 1296737 https://1564005101.rsc.cdn77.org/images/grocer... https://1564005101.rsc.cdn77.org/images/grocer... Butane\nIsobutane\nPropane\nAluminum chlorohyd... ... 64,9 CZK NaN NaN 432,67 CZK 300109105.0 l NaN False
3 NaN Nivea [{'id': 300109000}, {'id': 300109127}, {'id': ... dle aktuální nabídky 3698401885 Nivea® Intimo Sprchová emulze Sensitive je spe... 1296749 https://1564005101.rsc.cdn77.org/images/grocer... https://1564005101.rsc.cdn77.org/images/grocer... Aqua\nCocamidopropyl betaine\nSodium laureth s... ... 99,9 CZK NaN NaN 399,6 CZK NaN l NaN False
4 NaN Nivea [{'id': 300109000}, {'id': 300109060}, {'id': ... dle aktuální nabídky 2384072835 Krémové tekuté mýdlo na ruce s jedinečnou vůní... 1296751 https://1564005101.rsc.cdn77.org/images/grocer... https://1564005101.rsc.cdn77.org/images/grocer... Aqua\nCocamidopropyl betaine\nSodium laureth s... ... 49,9 CZK NaN NaN 199,6 CZK NaN l NaN False

5 rows × 23 columns

In [7]:
#vymena , za . cez replace a rovno pretypovanie na float / desatine cislo
dfProducts['price_amount'] = dfProducts['price_amount'].str.replace(',','.').astype(np.float64)
In [8]:
#uz je cena ako float
dfProducts.dtypes
Out[8]:
badge                         object
brand_name                    object
categories                    object
country_of_origin             object
crc32                          int64
description                   object
id                             int64
image                         object
image_large                   object
ingredients                   object
name                          object
new_name                      object
popularity                   float64
price_amount                 float64
price_currency                object
price_for_bottle.amount      float64
price_for_bottle.currency     object
price_for_unit_amount         object
price_for_unit_currency       object
cat_id                       float64
unit                          object
weight                        object
weight_variations               bool
dtype: object

Ulohy z SQL do Pandasu

1.) Vypis z datasetu dfProducts vsetky produkty, ktore su drahsie v atribute price_amount ako 70 Kc.
2.) Vypis prvych 10 najlacnejsich produktov. Pomocka na zotredenie je mozne pouzit napriklad sort_values()
3.) Vypis vsetky produkty ktore su brandu Nescafé Dolce Gusto a maju zaroven popularity vetsiu ako -11 . Pomocka na viacero podmienok sa pouziva zapis df[(df['size'] >= 5) & (df['total_bill'] > 45)]
4.) Vypis vsetky produkty ktore su z krajiny Dánsko alebo Francie. Pomocka - alebo sa znaci |
5.) Vypis vsetky produkty, ktore su v kategoriach 300109132, 300109052, 300101045 . Pomocka - Na IN sa pouziva df.isin().
6.) Zgrup produkty podla zeme a zrataj pocet produktov pre vybranu zem. Pomocka: GROUP BY a COUNT sa robia df.groupby('sex').size()
7.) Zrataj a zgrup priemernu popularitu vsetkych produktov podla krajiny, pre Cesko
8.) Aktualizuj vsetky hodnoty, v stlpci popularity, kde popularity je zaporne cislo na jeho absolutnu hodnotu. Aktualizacia vsetkych cisel > 2 o jeho mocninu, sa robi napriklad takto tips.loc[tips['tip'] < 2, 'tip'] *= 2

Uloha1 - SQL

SELECT *
FROM "dfProducts"
WHERE "price_amount" > 70;

In [9]:
u1 = dfProducts[dfProducts["price_amount"] > 70.0]
u1.head()
Out[9]:
badge brand_name categories country_of_origin crc32 description id image image_large ingredients ... price_amount price_currency price_for_bottle.amount price_for_bottle.currency price_for_unit_amount price_for_unit_currency cat_id unit weight weight_variations
3 NaN Nivea [{'id': 300109000}, {'id': 300109127}, {'id': ... dle aktuální nabídky 3698401885 Nivea® Intimo Sprchová emulze Sensitive je spe... 1296749 https://1564005101.rsc.cdn77.org/images/grocer... https://1564005101.rsc.cdn77.org/images/grocer... Aqua\nCocamidopropyl betaine\nSodium laureth s... ... 99.9 CZK NaN NaN 399,6 CZK NaN l NaN False
5 NaN Parodontax [{'id': 300109000}, {'id': 300109051}, {'id': ... dle aktuální nabídky 1117749507 Zubní kartáček parodontax je speciálně navržen... 1296949 https://1564005101.rsc.cdn77.org/images/grocer... https://1564005101.rsc.cdn77.org/images/grocer... NaN ... 184.9 CZK NaN NaN 61,63 CZK 300109053.0 ks NaN False
11 NaN Nescafé Dolce Gusto [{'id': 300108000}, {'id': 300108001}, {'id': ... dle aktuální nabídky 961017301 Rozpustná směs pro přípravu kakaového nápoje s... 1296973 https://1564005101.rsc.cdn77.org/images/grocer... https://1564005101.rsc.cdn77.org/images/grocer... NaN ... 149.9 CZK NaN NaN 9,37 CZK 300108005.0 ks NaN False
12 NaN Nescafé Dolce Gusto [{'id': 300108000}, {'id': 300108001}, {'id': ... dle aktuální nabídky 2884065199 Pražená mletá káva a sušené mléko\nNapěněné ml... 1296975 https://1564005101.rsc.cdn77.org/images/grocer... https://1564005101.rsc.cdn77.org/images/grocer... NaN ... 149.9 CZK NaN NaN 9,37 CZK 300108005.0 ks NaN False
13 NaN Nescafé Dolce Gusto [{'id': 300108000}, {'id': 300108001}, {'id': ... dle aktuální nabídky 2586908672 Lahodná káva pro každý den. Volba milovníků si... 1296977 https://1564005101.rsc.cdn77.org/images/grocer... https://1564005101.rsc.cdn77.org/images/grocer... NaN ... 149.9 CZK NaN NaN 9,37 CZK 300108005.0 ks NaN False

5 rows × 23 columns

Uloha 2

Vypis prvych 10 najlacnejsich produktov. Pomocka na zotredenie je mozne pouzit napriklad sort_values()

SQL riesenie


SELECT *
FROM "dfProducts"
ORDER BY "price_amount" ASC
LIMIT 10;

In [10]:
u2 = dfProducts.sort_values("price_amount").head(11)
u2.head()
Out[10]:
badge brand_name categories country_of_origin crc32 description id image image_large ingredients ... price_amount price_currency price_for_bottle.amount price_for_bottle.currency price_for_unit_amount price_for_unit_currency cat_id unit weight weight_variations
7388 https://1564005101.rsc.cdn77.org/images/icons/... NaN [{'id': 300108000}, {'id': 300108031}, {'id': ... Itálie 428769016 Ideální pro přípravu koktejlů, dochucení salát... 1320793 https://1564005101.rsc.cdn77.org/images/grocer... https://1564005101.rsc.cdn77.org/images/grocer... Složení: Limetková šťáva z koncentrátu (99,97%... ... 1.0 CZK NaN NaN 5 CZK 300108032.0 l NaN False
7375 https://1564005101.rsc.cdn77.org/images/icons/... NaN [{'id': 300106000}, {'id': 300106031}, {'id': ... dle aktuální nabídky 153902127 Chuť přírody!\nBez konzervantů\nBez přidaných ... 1320733 https://1564005101.rsc.cdn77.org/images/grocer... https://1564005101.rsc.cdn77.org/images/grocer... Koncentrovaný hovězí bujón (voda, hovězí extra... ... 1.0 CZK NaN NaN 17,86 CZK 300106040.0 kg NaN False
7585 NaN NaN [{'id': 75403}, {'id': 75455}, {'id': 133897},... Česká republika 2893385919 čerstvé pečivo pšeničné ve tvaru rohlíku\n\nsk... 1286399 https://1564005101.rsc.cdn77.org/images/grocer... https://1564005101.rsc.cdn77.org/images/grocer... pšeničná mouka 62%, pitná voda 31%, pekařské d... ... 1.5 CZK NaN NaN 34,88 CZK 300101013.0 kg NaN False
7474 NaN NaN [{'id': 75403}, {'id': 75455}, {'id': 133897},... Česká republika 491813887 Tradiční křupavé pečivo z Karlovy pekárny.\n\n... 1296509 https://1564005101.rsc.cdn77.org/images/grocer... https://1564005101.rsc.cdn77.org/images/grocer... Složení: Pšeničná mouka, voda, droždí, řepkový... ... 1.5 CZK NaN NaN 34,88 CZK 300101013.0 kg NaN False
7589 NaN NaN [{'id': 75403}, {'id': 75455}, {'id': 133897},... Česká republika 1829686149 Čerstvé pečivo pšeničné ve tvaru ražené housky... 1286405 https://1564005101.rsc.cdn77.org/images/grocer... https://1564005101.rsc.cdn77.org/images/grocer... pšeničná mouka 62%, pitná voda 31%, pekařské d... ... 1.9 CZK NaN NaN 38 CZK 300101014.0 kg NaN False

5 rows × 23 columns

Uloha 3

Vypis vsetky produkty ktore su brandu Nescafé Dolce Gusto a maju zaroven popularity vetsiu ako -11.


SQL riesenie

SELECT *
FROM "dfProducts"
WHERE "brand_name" = "Nescafé Dolce Gusto" AND "popularity" > -11;

In [11]:
u3 = dfProducts[(dfProducts["brand_name"] == "Nescafé Dolce Gusto") & (dfProducts["popularity"] > -11)]
u3.head()
Out[11]:
badge brand_name categories country_of_origin crc32 description id image image_large ingredients ... price_amount price_currency price_for_bottle.amount price_for_bottle.currency price_for_unit_amount price_for_unit_currency cat_id unit weight weight_variations
11 NaN Nescafé Dolce Gusto [{'id': 300108000}, {'id': 300108001}, {'id': ... dle aktuální nabídky 961017301 Rozpustná směs pro přípravu kakaového nápoje s... 1296973 https://1564005101.rsc.cdn77.org/images/grocer... https://1564005101.rsc.cdn77.org/images/grocer... NaN ... 149.9 CZK NaN NaN 9,37 CZK 300108005.0 ks NaN False

1 rows × 23 columns

Uloha 4

Vypis vsetky produkty ktore su z krajiny Dánsko alebo Francie. Pomocka alebo sa znaci |.

SQL riesenie

SELECT *
FROM "dfProducts"
WHERE "country_of_origin" = "Dánsko" OR "country_of_origin" = "Francie";

In [12]:
u4 = dfProducts[(dfProducts["country_of_origin"] == "Dánsko") | (dfProducts["country_of_origin"] == "Francie")]
u4.head()
Out[12]:
badge brand_name categories country_of_origin crc32 description id image image_large ingredients ... price_amount price_currency price_for_bottle.amount price_for_bottle.currency price_for_unit_amount price_for_unit_currency cat_id unit weight weight_variations
17 NaN Tulip [{'id': 300106000}, {'id': 300106068}, {'id': ... Dánsko 2375436625 Jemná paštika z vepřových jater. \n\nUpozorňuj... 1296989 https://1564005101.rsc.cdn77.org/images/grocer... https://1564005101.rsc.cdn77.org/images/grocer... Složení: vepřová játra (46%), vepřové sádlo, p... ... 29.90 CZK NaN NaN 239,2 CZK 300106081.0 kg NaN False
21 NaN Tulip [{'id': 300106000}, {'id': 300106068}, {'id': ... Dánsko 3612201864 Jemné "hotdogové" párky. \n\nUpozorňujeme, že ... 1297001 https://1564005101.rsc.cdn77.org/images/grocer... https://1564005101.rsc.cdn77.org/images/grocer... Složení: strojně oddělené maso vepřové (36% hm... ... 54.90 CZK NaN NaN 132,29 CZK 300106079.0 kg NaN False
23 NaN Alpro [{'id': 133331}, {'id': 133549}, {'id': 133553... Francie 3073776232 Alpro sójový nápoj original plný výborných, vy... 1297005 https://1564005101.rsc.cdn77.org/images/grocer... https://1564005101.rsc.cdn77.org/images/grocer... Složení: pitná voda, loupané sójové boby (5,9%... ... 59.90 CZK NaN NaN 59,9 CZK 300105066.0 l NaN False
201 NaN NaN [{'id': 300102000}, {'id': 300102008}, {'id': ... Francie 4210004502 Dýně hokaido má jemnou, sladkou trochu oříškov... 1297579 https://1564005101.rsc.cdn77.org/images/grocer... https://1564005101.rsc.cdn77.org/images/grocer... NaN ... 66.58 CZK NaN NaN 79,9 CZK 300102011.0 kg 1,2 True
310 NaN NaN [{'id': 300106000}, {'id': 300106096}, {'id': ... Francie 1258778530 Panzani vybírá tu nejkvalitnější tvrdou pšenic... 1297911 https://1564005101.rsc.cdn77.org/images/grocer... https://1564005101.rsc.cdn77.org/images/grocer... Složení: 100% krupice z tvrdozrnné pšenice\n\n... ... 44.90 CZK NaN NaN 89,8 CZK 300106103.0 kg NaN False

5 rows × 23 columns

Uloha 5

Vypis vsetky produkty, ktore su v kategoriach 300109132, 300109052, 300101045 . Pomocka - Na IN sa pouziva df.isin().

SQL riesenie

SELECT *
FROM "dfProducts"
WHERE "cat_id" IN (300109132, 300109052, 300101045)

In [13]:
u5 = dfProducts[dfProducts["cat_id"].isin([300109132, 300109052, 300101045])]
u5.head()
Out[13]:
badge brand_name categories country_of_origin crc32 description id image image_large ingredients ... price_amount price_currency price_for_bottle.amount price_for_bottle.currency price_for_unit_amount price_for_unit_currency cat_id unit weight weight_variations
6 NaN Odol [{'id': 75685}, {'id': 75689}, {'id': 134377},... dle aktuální nabídky 2931252705 Trojbarevná zubní pasta Odol3 malé zoubky byla... 1296951 https://1564005101.rsc.cdn77.org/images/grocer... https://1564005101.rsc.cdn77.org/images/grocer... Aqua\nHydrated silica\nSorbitol\nGlycerin\nXan... ... 49.9 CZK NaN NaN 998 CZK 300109052.0 l NaN False
10 NaN Dobré Pečivo [{'id': 300101000}, {'id': 300101043}, {'id': ... Česká republika 1652126142 \n\nUpozorňujeme, že tento produkt může obsaho... 1296967 https://1564005101.rsc.cdn77.org/images/grocer... https://1564005101.rsc.cdn77.org/images/grocer... Složení: Pšeničná kukuřičná krupice 72%, pšeni... ... 34.9 CZK NaN NaN 89,49 CZK 300101045.0 kg NaN False
112 NaN NaN [{'id': 300109000}, {'id': 300109051}, {'id': ... dle aktuální nabídky 723202123 Plivete při čištění zubů krev?\nKrvácení dásní... 1297325 https://1564005101.rsc.cdn77.org/images/grocer... https://1564005101.rsc.cdn77.org/images/grocer... Sodium bicarbonate\nAqua\nGlycerin\nAlcohol\nC... ... 84.9 CZK NaN NaN 1132 CZK 300109052.0 l NaN False
421 NaN NaN [{'id': 75609}, {'id': 134771}, {'id': 134779}... Německo 2138809376 Balení 50 ml\nPřirozená prevence zubního kazu ... 1298307 https://1564005101.rsc.cdn77.org/images/grocer... https://1564005101.rsc.cdn77.org/images/grocer... Glycerin, voda, oxid křemičitý, algin, extrakt... ... 129.9 CZK NaN NaN 2598 CZK 300109052.0 l NaN False
566 NaN NaN [{'id': 134253}, {'id': 75609}, {'id': 134771}... dle aktuální nabídky 4141570066 Nepěnivá zubní pasta s vysokou čistící schopno... 1299049 https://1564005101.rsc.cdn77.org/images/grocer... https://1564005101.rsc.cdn77.org/images/grocer... Bikarbonát sodný, voda, glycerin, oxid křemiči... ... 139.9 CZK NaN NaN 1865,33 CZK 300109052.0 l NaN False

5 rows × 23 columns

Uloha 6

Zgrup produkty podla zeme a zrataj pocet produktov pre vybranu zem.

SQL riesenie

SELECT COUNT(*), "country_of_origin"
FROM "dfProducts"
GROUP BY "country_of_origin";

In [14]:
u6 = dfProducts.groupby("country_of_origin").size()
u6.head()
Out[14]:
country_of_origin
Argentina     5
Arménie       1
Austrálie     6
Barbados      3
Belgie       83
dtype: int64

Uloha 7

Zrataj a zgrup priemernu popularitu vsetkych produktov podla krajiny, pre Cesko.

SQL riesenie

SELECT AVG("popularity") as "avg_pop"
FROM "dfProducts"
WHERE "country_of_origin" = "Česká republika"
GROUP BY "country_of_origin";

ak by sme chceli napriklad vsetky krajiny kde je priemerna popularity vyssia ako 10
SELECT AVG("popularity") as "avg_pop"
FROM "dfProducts"
GROUP BY "country_of_origin"
HAVING "avg_pop" > 10;

In [15]:
u7 = dfProducts[dfProducts["country_of_origin"] == "Česká republika"].groupby("country_of_origin").agg({'popularity' : np.mean})
u7.head()
Out[15]:
popularity
country_of_origin
Česká republika -246.318223

Uloha 8

Aktualizuj vsetky hodnoty, v stlpci popularity, kde popularity je zaporne cislo na jeho absolutnu hodnotu.

SQL riesenie

UPDATE "dfProducts"
SET "popularity" = ABS("popularity");

In [16]:
u8 = dfProducts.loc[dfProducts["popularity"] < 0, 'popularity'] = abs(dfProducts["popularity"])
u8.head()
Out[16]:
0     9.0
1     8.0
2     4.0
3    13.0
4    13.0
Name: popularity, dtype: float64