데이터 전처리
데이터 전처리의 기본
garbage Data —[ Great Model ]—> Garbage Results
데이터 전처리의 주요 과정
데이터 전처리 수행 프로세스 (중요)
1.중복값 제거 및 결측치 처리 -> 2.이상치 처리 -> 3.Feature Engineering
주요 목적 : 더 나은 분석 결과 도출 및 모형 성능 개선 실현
pandas 기본자료형
숫자, 문자 모두 들어간다.
중복이 불가능하다.
판다스 라이브러리 불러오기
import pandas as pd
print(pd.__version__)
1.3.5
테스트
temp_dic = {"col1": [1, 2, 3], "col2": [3, 4, 5]}
df = pd.DataFrame(temp_dic)
df
col1
col2
0
1
3
1
2
4
2
3
5
<svg xmlns="http://www.w3.org/2000/svg" height="24px" viewBox="0 0 24 24" width="24px">
<script>
const buttonEl =
document.querySelector('#df-5f12a67d-363f-495e-a8d6-e15402e0c5d6 button.colab-df-convert');
buttonEl.style.display =
google.colab.kernel.accessAllowed ? 'block' : 'none';
async function convertToInteractive(key) {
const element = document.querySelector('#df-5f12a67d-363f-495e-a8d6-e15402e0c5d6');
const dataTable =
await google.colab.kernel.invokeFunction('convertToInteractive',
[key], {});
if (!dataTable) return;
const docLinkHtml = 'Like what you see? Visit the ' +
'<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
+ ' to learn more about interactive tables.';
element.innerHTML = '';
dataTable['output_type'] = 'display_data';
await google.colab.output.renderOutput(dataTable, element);
const docLink = document.createElement('div');
docLink.innerHTML = docLinkHtml;
element.appendChild(docLink);
}
</script>
</div>
<class 'pandas.core.frame.DataFrame'>
temp_dic = {'a': 1, 'b': 2, 'c': 3}
ser = pd.Series(temp_dic)
print(ser)
print(type(ser))
a 1
b 2
c 3
dtype: int64
<class 'pandas.core.series.Series'>
구글 드라이브 연동
구글 드라이브 → Colab Notebooks → 새 폴더 생성: data → 슬랙에서 내려받은 Lemonade2016.csv 파일을 올린다 → 다음 코드를 실행
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
'Mounted at /content/drive'가 출력되었으므로 구글 드라이브 연동에 성공한 것이다.
좌측의 폴더 아이콘 → drive → MyDrive → Colab Notebooks → data → Lemonade2016.csv를 찾아 우클릭 → 경로 복사 → 다음 코드에 붙여넣어 사용
DATA_PATH = '/content/drive/MyDrive/Colab Notebooks/data/Lemonade2016.csv'
juice = pd.read_csv(DATA_PATH)
juice
Date
Location
Lemon
Orange
Temperature
Leaflets
Price
0
7/1/2016
Park
97
67
70
90.0
0.25
1
7/2/2016
Park
98
67
72
90.0
0.25
2
7/3/2016
Park
110
77
71
104.0
0.25
3
7/4/2016
Beach
134
99
76
98.0
0.25
4
7/5/2016
Beach
159
118
78
135.0
0.25
5
7/6/2016
Beach
103
69
82
90.0
0.25
6
7/6/2016
Beach
103
69
82
90.0
0.25
7
7/7/2016
Beach
143
101
81
135.0
0.25
8
NaN
Beach
123
86
82
113.0
0.25
9
7/9/2016
Beach
134
95
80
126.0
0.25
10
7/10/2016
Beach
140
98
82
131.0
0.25
11
7/11/2016
Beach
162
120
83
135.0
0.25
12
7/12/2016
Beach
130
95
84
99.0
0.25
13
7/13/2016
Beach
109
75
77
99.0
0.25
14
7/14/2016
Beach
122
85
78
113.0
0.25
15
7/15/2016
Beach
98
62
75
108.0
0.50
16
7/16/2016
Beach
81
50
74
90.0
0.50
17
7/17/2016
Beach
115
76
77
126.0
0.50
18
7/18/2016
Park
131
92
81
122.0
0.50
19
7/19/2016
Park
122
85
78
113.0
0.50
20
7/20/2016
Park
71
42
70
NaN
0.50
21
7/21/2016
Park
83
50
77
90.0
0.50
22
7/22/2016
Park
112
75
80
108.0
0.50
23
7/23/2016
Park
120
82
81
117.0
0.50
24
7/24/2016
Park
121
82
82
117.0
0.50
25
7/25/2016
Park
156
113
84
135.0
0.50
26
7/26/2016
Park
176
129
83
158.0
0.35
27
7/27/2016
Park
104
68
80
99.0
0.35
28
7/28/2016
Park
96
63
82
90.0
0.35
29
7/29/2016
Park
100
66
81
95.0
0.35
30
7/30/2016
Beach
88
57
82
81.0
0.35
31
7/31/2016
Beach
76
47
82
68.0
0.35
<svg xmlns="http://www.w3.org/2000/svg" height="24px" viewBox="0 0 24 24" width="24px">
<script>
const buttonEl =
document.querySelector('#df-e4ed5b94-20e7-42ba-9f65-459f54e1728a button.colab-df-convert');
buttonEl.style.display =
google.colab.kernel.accessAllowed ? 'block' : 'none';
async function convertToInteractive(key) {
const element = document.querySelector('#df-e4ed5b94-20e7-42ba-9f65-459f54e1728a');
const dataTable =
await google.colab.kernel.invokeFunction('convertToInteractive',
[key], {});
if (!dataTable) return;
const docLinkHtml = 'Like what you see? Visit the ' +
'<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
+ ' to learn more about interactive tables.';
element.innerHTML = '';
dataTable['output_type'] = 'display_data';
await google.colab.output.renderOutput(dataTable, element);
const docLink = document.createElement('div');
docLink.innerHTML = docLinkHtml;
element.appendChild(docLink);
}
</script>
</div>
데이터를 불러왔다.
첫 번째로 파악해야 하는 것! juice.info()로 행·열 개수, 컬럼별 결측치 여부, 자료형을 확인한다.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32 entries, 0 to 31
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Date 31 non-null object
1 Location 32 non-null object
2 Lemon 32 non-null int64
3 Orange 32 non-null int64
4 Temperature 32 non-null int64
5 Leaflets 31 non-null float64
6 Price 32 non-null float64
dtypes: float64(2), int64(3), object(2)
memory usage: 1.9+ KB
Date
Location
Lemon
Orange
Temperature
Leaflets
Price
0
7/1/2016
Park
97
67
70
90.0
0.25
1
7/2/2016
Park
98
67
72
90.0
0.25
2
7/3/2016
Park
110
77
71
104.0
0.25
3
7/4/2016
Beach
134
99
76
98.0
0.25
4
7/5/2016
Beach
159
118
78
135.0
0.25
<svg xmlns="http://www.w3.org/2000/svg" height="24px" viewBox="0 0 24 24" width="24px">
<script>
const buttonEl =
document.querySelector('#df-c3238942-1033-4010-80b8-10e94c66dc23 button.colab-df-convert');
buttonEl.style.display =
google.colab.kernel.accessAllowed ? 'block' : 'none';
async function convertToInteractive(key) {
const element = document.querySelector('#df-c3238942-1033-4010-80b8-10e94c66dc23');
const dataTable =
await google.colab.kernel.invokeFunction('convertToInteractive',
[key], {});
if (!dataTable) return;
const docLinkHtml = 'Like what you see? Visit the ' +
'<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
+ ' to learn more about interactive tables.';
element.innerHTML = '';
dataTable['output_type'] = 'display_data';
await google.colab.output.renderOutput(dataTable, element);
const docLink = document.createElement('div');
docLink.innerHTML = docLinkHtml;
element.appendChild(docLink);
}
</script>
</div>
Date
Location
Lemon
Orange
Temperature
Leaflets
Price
27
7/27/2016
Park
104
68
80
99.0
0.35
28
7/28/2016
Park
96
63
82
90.0
0.35
29
7/29/2016
Park
100
66
81
95.0
0.35
30
7/30/2016
Beach
88
57
82
81.0
0.35
31
7/31/2016
Beach
76
47
82
68.0
0.35
<svg xmlns="http://www.w3.org/2000/svg" height="24px" viewBox="0 0 24 24" width="24px">
<script>
const buttonEl =
document.querySelector('#df-cc60af2a-dd96-48c1-9398-546b4a947c77 button.colab-df-convert');
buttonEl.style.display =
google.colab.kernel.accessAllowed ? 'block' : 'none';
async function convertToInteractive(key) {
const element = document.querySelector('#df-cc60af2a-dd96-48c1-9398-546b4a947c77');
const dataTable =
await google.colab.kernel.invokeFunction('convertToInteractive',
[key], {});
if (!dataTable) return;
const docLinkHtml = 'Like what you see? Visit the ' +
'<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
+ ' to learn more about interactive tables.';
element.innerHTML = '';
dataTable['output_type'] = 'display_data';
await google.colab.output.renderOutput(dataTable, element);
const docLink = document.createElement('div');
docLink.innerHTML = docLinkHtml;
element.appendChild(docLink);
}
</script>
</div>
Lemon
Orange
Temperature
Leaflets
Price
count
32.000000
32.000000
32.000000
31.000000
32.000000
mean
116.156250
80.000000
78.968750
108.548387
0.354687
std
25.823357
21.863211
4.067847
20.117718
0.113137
min
71.000000
42.000000
70.000000
68.000000
0.250000
25%
98.000000
66.750000
77.000000
90.000000
0.250000
50%
113.500000
76.500000
80.500000
108.000000
0.350000
75%
131.750000
95.000000
82.000000
124.000000
0.500000
max
176.000000
129.000000
84.000000
158.000000
0.500000
<svg xmlns="http://www.w3.org/2000/svg" height="24px" viewBox="0 0 24 24" width="24px">
<script>
const buttonEl =
document.querySelector('#df-bfd69db7-f9d2-49ea-84ed-2989ca9e02a8 button.colab-df-convert');
buttonEl.style.display =
google.colab.kernel.accessAllowed ? 'block' : 'none';
async function convertToInteractive(key) {
const element = document.querySelector('#df-bfd69db7-f9d2-49ea-84ed-2989ca9e02a8');
const dataTable =
await google.colab.kernel.invokeFunction('convertToInteractive',
[key], {});
if (!dataTable) return;
const docLinkHtml = 'Like what you see? Visit the ' +
'<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
+ ' to learn more about interactive tables.';
element.innerHTML = '';
dataTable['output_type'] = 'display_data';
await google.colab.output.renderOutput(dataTable, element);
const docLink = document.createElement('div');
docLink.innerHTML = docLinkHtml;
element.appendChild(docLink);
}
</script>
</div>
pandas.core.frame.DataFrame
print(juice['Location'].value_counts())
print(type(juice['Location'].value_counts()))
Beach 17
Park 15
Name: Location, dtype: int64
<class 'pandas.core.series.Series'>
데이터 다뤄보기
juice['Sold'] = 0
print(juice.head(3))
Date Location Lemon Orange Temperature Leaflets Price Sold
0 7/1/2016 Park 97 67 70 90.0 0.25 0
1 7/2/2016 Park 98 67 72 90.0 0.25 0
2 7/3/2016 Park 110 77 71 104.0 0.25 0
juice['Sold'] = juice['Lemon'] + juice['Orange']
print(juice.head(3))
Date Location Lemon Orange Temperature Leaflets Price Sold
0 7/1/2016 Park 97 67 70 90.0 0.25 164
1 7/2/2016 Park 98 67 72 90.0 0.25 165
2 7/3/2016 Park 110 77 71 104.0 0.25 187
매출액 = 가격 x 판매량
매출액을 Revenue 컬럼으로 만들어 보자.
juice['Revenue'] = juice['Sold'] * juice['Price']
print(juice.head(3))
Date Location Lemon Orange Temperature Leaflets Price Sold \
0 7/1/2016 Park 97 67 70 90.0 0.25 164
1 7/2/2016 Park 98 67 72 90.0 0.25 165
2 7/3/2016 Park 110 77 71 104.0 0.25 187
Revenue
0 41.00
1 41.25
2 46.75
drop(axis = 0|1)
axis는 ‘축’을 의미한다. 한 축을 따라서 연산이 된다.
axis를 0으로 설정 시, 행(=index)방향으로 drop() 실행
axis를 1로 설정 시, 열방향으로 drop 수행함.
juice_column_drop = juice.drop('Sold', axis=1)
print(juice_column_drop.head(3))
Date Location Lemon Orange Temperature Leaflets Price Revenue
0 7/1/2016 Park 97 67 70 90.0 0.25 41.00
1 7/2/2016 Park 98 67 72 90.0 0.25 41.25
2 7/3/2016 Park 110 77 71 104.0 0.25 46.75
juice_row_drop = juice.drop(0, axis=0)
print(juice_row_drop.head(3))
Date Location Lemon Orange Temperature Leaflets Price Sold \
1 7/2/2016 Park 98 67 72 90.0 0.25 165
2 7/3/2016 Park 110 77 71 104.0 0.25 187
3 7/4/2016 Beach 134 99 76 98.0 0.25 233
Revenue
1 41.25
2 46.75
3 58.25
데이터 인덱싱
Date
Location
Lemon
Orange
Temperature
Leaflets
Price
Sold
Revenue
0
7/1/2016
Park
97
67
70
90.0
0.25
164
41.00
1
7/2/2016
Park
98
67
72
90.0
0.25
165
41.25
2
7/3/2016
Park
110
77
71
104.0
0.25
187
46.75
3
7/4/2016
Beach
134
99
76
98.0
0.25
233
58.25
4
7/5/2016
Beach
159
118
78
135.0
0.25
277
69.25
<svg xmlns="http://www.w3.org/2000/svg" height="24px" viewBox="0 0 24 24" width="24px">
<script>
const buttonEl =
document.querySelector('#df-bfa3fabe-e933-4527-879f-12c188c0b8bd button.colab-df-convert');
buttonEl.style.display =
google.colab.kernel.accessAllowed ? 'block' : 'none';
async function convertToInteractive(key) {
const element = document.querySelector('#df-bfa3fabe-e933-4527-879f-12c188c0b8bd');
const dataTable =
await google.colab.kernel.invokeFunction('convertToInteractive',
[key], {});
if (!dataTable) return;
const docLinkHtml = 'Like what you see? Visit the ' +
'<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
+ ' to learn more about interactive tables.';
element.innerHTML = '';
dataTable['output_type'] = 'display_data';
await google.colab.output.renderOutput(dataTable, element);
const docLink = document.createElement('div');
docLink.innerHTML = docLinkHtml;
element.appendChild(docLink);
}
</script>
</div>
boolean 값을 활용한 데이터 추출
juice[juice['Location'] == "Beach"]
Date
Location
Lemon
Orange
Temperature
Leaflets
Price
Sold
Revenue
3
7/4/2016
Beach
134
99
76
98.0
0.25
233
58.25
4
7/5/2016
Beach
159
118
78
135.0
0.25
277
69.25
5
7/6/2016
Beach
103
69
82
90.0
0.25
172
43.00
6
7/6/2016
Beach
103
69
82
90.0
0.25
172
43.00
7
7/7/2016
Beach
143
101
81
135.0
0.25
244
61.00
8
NaN
Beach
123
86
82
113.0
0.25
209
52.25
9
7/9/2016
Beach
134
95
80
126.0
0.25
229
57.25
10
7/10/2016
Beach
140
98
82
131.0
0.25
238
59.50
11
7/11/2016
Beach
162
120
83
135.0
0.25
282
70.50
12
7/12/2016
Beach
130
95
84
99.0
0.25
225
56.25
13
7/13/2016
Beach
109
75
77
99.0
0.25
184
46.00
14
7/14/2016
Beach
122
85
78
113.0
0.25
207
51.75
15
7/15/2016
Beach
98
62
75
108.0
0.50
160
80.00
16
7/16/2016
Beach
81
50
74
90.0
0.50
131
65.50
17
7/17/2016
Beach
115
76
77
126.0
0.50
191
95.50
30
7/30/2016
Beach
88
57
82
81.0
0.35
145
50.75
31
7/31/2016
Beach
76
47
82
68.0
0.35
123
43.05
<svg xmlns="http://www.w3.org/2000/svg" height="24px" viewBox="0 0 24 24" width="24px">
<script>
const buttonEl =
document.querySelector('#df-f23f5092-ba57-4126-bdd5-ecc3581c90cd button.colab-df-convert');
buttonEl.style.display =
google.colab.kernel.accessAllowed ? 'block' : 'none';
async function convertToInteractive(key) {
const element = document.querySelector('#df-f23f5092-ba57-4126-bdd5-ecc3581c90cd');
const dataTable =
await google.colab.kernel.invokeFunction('convertToInteractive',
[key], {});
if (!dataTable) return;
const docLinkHtml = 'Like what you see? Visit the ' +
'<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
+ ' to learn more about interactive tables.';
element.innerHTML = '';
dataTable['output_type'] = 'display_data';
await google.colab.output.renderOutput(dataTable, element);
const docLink = document.createElement('div');
docLink.innerHTML = docLinkHtml;
element.appendChild(docLink);
}
</script>
</div>
juice[juice['Leaflets'] >= 100]
Date
Location
Lemon
Orange
Temperature
Leaflets
Price
Sold
Revenue
2
7/3/2016
Park
110
77
71
104.0
0.25
187
46.75
4
7/5/2016
Beach
159
118
78
135.0
0.25
277
69.25
7
7/7/2016
Beach
143
101
81
135.0
0.25
244
61.00
8
NaN
Beach
123
86
82
113.0
0.25
209
52.25
9
7/9/2016
Beach
134
95
80
126.0
0.25
229
57.25
10
7/10/2016
Beach
140
98
82
131.0
0.25
238
59.50
11
7/11/2016
Beach
162
120
83
135.0
0.25
282
70.50
14
7/14/2016
Beach
122
85
78
113.0
0.25
207
51.75
15
7/15/2016
Beach
98
62
75
108.0
0.50
160
80.00
17
7/17/2016
Beach
115
76
77
126.0
0.50
191
95.50
18
7/18/2016
Park
131
92
81
122.0
0.50
223
111.50
19
7/19/2016
Park
122
85
78
113.0
0.50
207
103.50
22
7/22/2016
Park
112
75
80
108.0
0.50
187
93.50
23
7/23/2016
Park
120
82
81
117.0
0.50
202
101.00
24
7/24/2016
Park
121
82
82
117.0
0.50
203
101.50
25
7/25/2016
Park
156
113
84
135.0
0.50
269
134.50
26
7/26/2016
Park
176
129
83
158.0
0.35
305
106.75
<svg xmlns="http://www.w3.org/2000/svg" height="24px" viewBox="0 0 24 24" width="24px">
<script>
const buttonEl =
document.querySelector('#df-080b31c4-9d87-4d46-a98d-5d6eec44b68f button.colab-df-convert');
buttonEl.style.display =
google.colab.kernel.accessAllowed ? 'block' : 'none';
async function convertToInteractive(key) {
const element = document.querySelector('#df-080b31c4-9d87-4d46-a98d-5d6eec44b68f');
const dataTable =
await google.colab.kernel.invokeFunction('convertToInteractive',
[key], {});
if (!dataTable) return;
const docLinkHtml = 'Like what you see? Visit the ' +
'<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
+ ' to learn more about interactive tables.';
element.innerHTML = '';
dataTable['output_type'] = 'display_data';
await google.colab.output.renderOutput(dataTable, element);
const docLink = document.createElement('div');
docLink.innerHTML = docLinkHtml;
element.appendChild(docLink);
}
</script>
</div>
iloc vs loc
Date
Location
Lemon
Orange
Temperature
Leaflets
Price
Sold
Revenue
0
7/1/2016
Park
97
67
70
90.0
0.25
164
41.00
1
7/2/2016
Park
98
67
72
90.0
0.25
165
41.25
2
7/3/2016
Park
110
77
71
104.0
0.25
187
46.75
<svg xmlns="http://www.w3.org/2000/svg" height="24px" viewBox="0 0 24 24" width="24px">
<script>
const buttonEl =
document.querySelector('#df-dadb1a11-c681-42a1-9b8b-85510d760ea0 button.colab-df-convert');
buttonEl.style.display =
google.colab.kernel.accessAllowed ? 'block' : 'none';
async function convertToInteractive(key) {
const element = document.querySelector('#df-dadb1a11-c681-42a1-9b8b-85510d760ea0');
const dataTable =
await google.colab.kernel.invokeFunction('convertToInteractive',
[key], {});
if (!dataTable) return;
const docLinkHtml = 'Like what you see? Visit the ' +
'<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
+ ' to learn more about interactive tables.';
element.innerHTML = '';
dataTable['output_type'] = 'display_data';
await google.colab.output.renderOutput(dataTable, element);
const docLink = document.createElement('div');
docLink.innerHTML = docLinkHtml;
element.appendChild(docLink);
}
</script>
</div>
%%time
%%time
juice.iloc[0:3, 0:2]
CPU times: user 2.14 ms, sys: 0 ns, total: 2.14 ms
Wall time: 3.19 ms
Date
Location
0
7/1/2016
Park
1
7/2/2016
Park
2
7/3/2016
Park
<svg xmlns="http://www.w3.org/2000/svg" height="24px" viewBox="0 0 24 24" width="24px">
<script>
const buttonEl =
document.querySelector('#df-dfd7fd78-f2b8-491f-a422-bd0e37bc0297 button.colab-df-convert');
buttonEl.style.display =
google.colab.kernel.accessAllowed ? 'block' : 'none';
async function convertToInteractive(key) {
const element = document.querySelector('#df-dfd7fd78-f2b8-491f-a422-bd0e37bc0297');
const dataTable =
await google.colab.kernel.invokeFunction('convertToInteractive',
[key], {});
if (!dataTable) return;
const docLinkHtml = 'Like what you see? Visit the ' +
'<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
+ ' to learn more about interactive tables.';
element.innerHTML = '';
dataTable['output_type'] = 'display_data';
await google.colab.output.renderOutput(dataTable, element);
const docLink = document.createElement('div');
docLink.innerHTML = docLinkHtml;
element.appendChild(docLink);
}
</script>
</div>
iloc은 정수 위치 기준이라 끝 위치(3)를 포함하지 않지만, loc은 라벨 기준이라 끝 라벨(2)을 포함한다. 따라서 아래 코드는 위의 iloc[0:3, 0:2]와 같은 결과를 낸다.
%%time
juice.loc[0:2, ['Date', 'Location']]
CPU times: user 1.64 ms, sys: 0 ns, total: 1.64 ms
Wall time: 1.62 ms
Date
Location
0
7/1/2016
Park
1
7/2/2016
Park
2
7/3/2016
Park
<svg xmlns="http://www.w3.org/2000/svg" height="24px" viewBox="0 0 24 24" width="24px">
<script>
const buttonEl =
document.querySelector('#df-f63e27df-425e-4e52-acd7-6c213c3c886a button.colab-df-convert');
buttonEl.style.display =
google.colab.kernel.accessAllowed ? 'block' : 'none';
async function convertToInteractive(key) {
const element = document.querySelector('#df-f63e27df-425e-4e52-acd7-6c213c3c886a');
const dataTable =
await google.colab.kernel.invokeFunction('convertToInteractive',
[key], {});
if (!dataTable) return;
const docLinkHtml = 'Like what you see? Visit the ' +
'<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
+ ' to learn more about interactive tables.';
element.innerHTML = '';
dataTable['output_type'] = 'display_data';
await google.colab.output.renderOutput(dataTable, element);
const docLink = document.createElement('div');
docLink.innerHTML = docLinkHtml;
element.appendChild(docLink);
}
</script>
</div>
juice.loc[juice['Leaflets'] >= 100, ['Date', 'Location']]
Date
Location
2
7/3/2016
Park
4
7/5/2016
Beach
7
7/7/2016
Beach
8
NaN
Beach
9
7/9/2016
Beach
10
7/10/2016
Beach
11
7/11/2016
Beach
14
7/14/2016
Beach
15
7/15/2016
Beach
17
7/17/2016
Beach
18
7/18/2016
Park
19
7/19/2016
Park
22
7/22/2016
Park
23
7/23/2016
Park
24
7/24/2016
Park
25
7/25/2016
Park
26
7/26/2016
Park
<svg xmlns="http://www.w3.org/2000/svg" height="24px" viewBox="0 0 24 24" width="24px">
<script>
const buttonEl =
document.querySelector('#df-80c7ea4c-f3b6-416c-90a6-935ca4d10c87 button.colab-df-convert');
buttonEl.style.display =
google.colab.kernel.accessAllowed ? 'block' : 'none';
async function convertToInteractive(key) {
const element = document.querySelector('#df-80c7ea4c-f3b6-416c-90a6-935ca4d10c87');
const dataTable =
await google.colab.kernel.invokeFunction('convertToInteractive',
[key], {});
if (!dataTable) return;
const docLinkHtml = 'Like what you see? Visit the ' +
'<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
+ ' to learn more about interactive tables.';
element.innerHTML = '';
dataTable['output_type'] = 'display_data';
await google.colab.output.renderOutput(dataTable, element);
const docLink = document.createElement('div');
docLink.innerHTML = docLinkHtml;
element.appendChild(docLink);
}
</script>
</div>
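juice.iloc[juice['Leaflets'] >= 100, 0:2]
위 코드는 에러가 발생한다. iloc은 정수 위치 기반 인덱서라서 불리언 Series를 행 조건으로 받지 못한다. 조건으로 행을 추출하려면 loc을 사용하거나, juice.iloc[(juice['Leaflets'] >= 100).values, 0:2]처럼 불리언 배열로 바꿔서 전달한다.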
정렬
juice.sort_values(by=['Revenue'], ascending=False).head(3)
Date
Location
Lemon
Orange
Temperature
Leaflets
Price
Sold
Revenue
25
7/25/2016
Park
156
113
84
135.0
0.50
269
134.50
18
7/18/2016
Park
131
92
81
122.0
0.50
223
111.50
26
7/26/2016
Park
176
129
83
158.0
0.35
305
106.75
<svg xmlns="http://www.w3.org/2000/svg" height="24px" viewBox="0 0 24 24" width="24px">
<script>
const buttonEl =
document.querySelector('#df-d4ef77c6-8bab-4eae-9f75-40bfaf70f3b7 button.colab-df-convert');
buttonEl.style.display =
google.colab.kernel.accessAllowed ? 'block' : 'none';
async function convertToInteractive(key) {
const element = document.querySelector('#df-d4ef77c6-8bab-4eae-9f75-40bfaf70f3b7');
const dataTable =
await google.colab.kernel.invokeFunction('convertToInteractive',
[key], {});
if (!dataTable) return;
const docLinkHtml = 'Like what you see? Visit the ' +
'<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
+ ' to learn more about interactive tables.';
element.innerHTML = '';
dataTable['output_type'] = 'display_data';
await google.colab.output.renderOutput(dataTable, element);
const docLink = document.createElement('div');
docLink.innerHTML = docLinkHtml;
element.appendChild(docLink);
}
</script>
</div>
juice.sort_values(by=['Price', 'Temperature'], ascending=False)
Date
Location
Lemon
Orange
Temperature
Leaflets
Price
Sold
Revenue
25
7/25/2016
Park
156
113
84
135.0
0.50
269
134.50
24
7/24/2016
Park
121
82
82
117.0
0.50
203
101.50
18
7/18/2016
Park
131
92
81
122.0
0.50
223
111.50
23
7/23/2016
Park
120
82
81
117.0
0.50
202
101.00
22
7/22/2016
Park
112
75
80
108.0
0.50
187
93.50
19
7/19/2016
Park
122
85
78
113.0
0.50
207
103.50
17
7/17/2016
Beach
115
76
77
126.0
0.50
191
95.50
21
7/21/2016
Park
83
50
77
90.0
0.50
133
66.50
15
7/15/2016
Beach
98
62
75
108.0
0.50
160
80.00
16
7/16/2016
Beach
81
50
74
90.0
0.50
131
65.50
20
7/20/2016
Park
71
42
70
NaN
0.50
113
56.50
26
7/26/2016
Park
176
129
83
158.0
0.35
305
106.75
28
7/28/2016
Park
96
63
82
90.0
0.35
159
55.65
30
7/30/2016
Beach
88
57
82
81.0
0.35
145
50.75
31
7/31/2016
Beach
76
47
82
68.0
0.35
123
43.05
29
7/29/2016
Park
100
66
81
95.0
0.35
166
58.10
27
7/27/2016
Park
104
68
80
99.0
0.35
172
60.20
12
7/12/2016
Beach
130
95
84
99.0
0.25
225
56.25
11
7/11/2016
Beach
162
120
83
135.0
0.25
282
70.50
5
7/6/2016
Beach
103
69
82
90.0
0.25
172
43.00
6
7/6/2016
Beach
103
69
82
90.0
0.25
172
43.00
8
NaN
Beach
123
86
82
113.0
0.25
209
52.25
10
7/10/2016
Beach
140
98
82
131.0
0.25
238
59.50
7
7/7/2016
Beach
143
101
81
135.0
0.25
244
61.00
9
7/9/2016
Beach
134
95
80
126.0
0.25
229
57.25
4
7/5/2016
Beach
159
118
78
135.0
0.25
277
69.25
14
7/14/2016
Beach
122
85
78
113.0
0.25
207
51.75
13
7/13/2016
Beach
109
75
77
99.0
0.25
184
46.00
3
7/4/2016
Beach
134
99
76
98.0
0.25
233
58.25
1
7/2/2016
Park
98
67
72
90.0
0.25
165
41.25
2
7/3/2016
Park
110
77
71
104.0
0.25
187
46.75
0
7/1/2016
Park
97
67
70
90.0
0.25
164
41.00
<svg xmlns="http://www.w3.org/2000/svg" height="24px" viewBox="0 0 24 24" width="24px">
<script>
const buttonEl =
document.querySelector('#df-14051fd5-627b-4ebe-ab05-3415f55cc7f3 button.colab-df-convert');
buttonEl.style.display =
google.colab.kernel.accessAllowed ? 'block' : 'none';
async function convertToInteractive(key) {
const element = document.querySelector('#df-14051fd5-627b-4ebe-ab05-3415f55cc7f3');
const dataTable =
await google.colab.kernel.invokeFunction('convertToInteractive',
[key], {});
if (!dataTable) return;
const docLinkHtml = 'Like what you see? Visit the ' +
'<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
+ ' to learn more about interactive tables.';
element.innerHTML = '';
dataTable['output_type'] = 'display_data';
await google.colab.output.renderOutput(dataTable, element);
const docLink = document.createElement('div');
docLink.innerHTML = docLinkHtml;
element.appendChild(docLink);
}
</script>
</div>
Group by
R의 dplyr에서 group_by() %>% summarize()에 해당하는 기능이다.
-> 그룹별 데이터 요약 -> 엑셀의 피벗 테이블과 같은 역할
juice.groupby(by='Location').count()
Date
Lemon
Orange
Temperature
Leaflets
Price
Sold
Revenue
Location
Beach
16
17
17
17
17
17
17
17
Park
15
15
15
15
14
15
15
15
<svg xmlns="http://www.w3.org/2000/svg" height="24px" viewBox="0 0 24 24" width="24px">
<script>
const buttonEl =
document.querySelector('#df-26afeca1-6bb7-494f-ba2d-92aab015b058 button.colab-df-convert');
buttonEl.style.display =
google.colab.kernel.accessAllowed ? 'block' : 'none';
async function convertToInteractive(key) {
const element = document.querySelector('#df-26afeca1-6bb7-494f-ba2d-92aab015b058');
const dataTable =
await google.colab.kernel.invokeFunction('convertToInteractive',
[key], {});
if (!dataTable) return;
const docLinkHtml = 'Like what you see? Visit the ' +
'<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
+ ' to learn more about interactive tables.';
element.innerHTML = '';
dataTable['output_type'] = 'display_data';
await google.colab.output.renderOutput(dataTable, element);
const docLink = document.createElement('div');
docLink.innerHTML = docLinkHtml;
element.appendChild(docLink);
}
</script>
</div>
import numpy as np
juice.groupby(['Location'])['Revenue'].agg([max, min, sum, np.mean])
max
min
sum
mean
Location
Beach
95.5
43.0
1002.8
58.988235
Park
134.5
41.0
1178.2
78.546667
<svg xmlns="http://www.w3.org/2000/svg" height="24px" viewBox="0 0 24 24" width="24px">
<script>
const buttonEl =
document.querySelector('#df-650575f1-c764-4097-b860-3fa1b26021c5 button.colab-df-convert');
buttonEl.style.display =
google.colab.kernel.accessAllowed ? 'block' : 'none';
async function convertToInteractive(key) {
const element = document.querySelector('#df-650575f1-c764-4097-b860-3fa1b26021c5');
const dataTable =
await google.colab.kernel.invokeFunction('convertToInteractive',
[key], {});
if (!dataTable) return;
const docLinkHtml = 'Like what you see? Visit the ' +
'<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
+ ' to learn more about interactive tables.';
element.innerHTML = '';
dataTable['output_type'] = 'display_data';
await google.colab.output.renderOutput(dataTable, element);
const docLink = document.createElement('div');
docLink.innerHTML = docLinkHtml;
element.appendChild(docLink);
}
</script>
</div>
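import numpy as np
juice.groupby(['Location'])['Revenue', 'Lemon'].agg([max, min, sum, np.mean])
여러 컬럼을 ['Revenue', 'Lemon']처럼 튜플 형태의 키로 넘기면 아래와 같은 FutureWarning이 발생한다. [['Revenue', 'Lemon']]처럼 리스트로 감싸서 선택하는 것이 권장된다.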
/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:4: FutureWarning: Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.
after removing the cwd from sys.path.
Revenue
Lemon
max
min
sum
mean
max
min
sum
mean
Location
Beach
95.5
43.0
1002.8
58.988235
162
76
2020
118.823529
Park
134.5
41.0
1178.2
78.546667
176
71
1697
113.133333
<svg xmlns="http://www.w3.org/2000/svg" height="24px" viewBox="0 0 24 24" width="24px">
<script>
const buttonEl =
document.querySelector('#df-7a3b6989-de2d-4a76-8bd8-66538dc5863c button.colab-df-convert');
buttonEl.style.display =
google.colab.kernel.accessAllowed ? 'block' : 'none';
async function convertToInteractive(key) {
const element = document.querySelector('#df-7a3b6989-de2d-4a76-8bd8-66538dc5863c');
const dataTable =
await google.colab.kernel.invokeFunction('convertToInteractive',
[key], {});
if (!dataTable) return;
const docLinkHtml = 'Like what you see? Visit the ' +
'<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
+ ' to learn more about interactive tables.';
element.innerHTML = '';
dataTable['output_type'] = 'display_data';
await google.colab.output.renderOutput(dataTable, element);
const docLink = document.createElement('div');
docLink.innerHTML = docLinkHtml;
element.appendChild(docLink);
}
</script>
</div>
You need to set install_url to use ShareThis. Please set it in _config.yml.
Comments
You forgot to set the shortname for Disqus. Please set it in _config.yml.