pandas_tutorial_02

  • 라이브러리 불러오기
1
2
import pandas as pd
print(pd.__version__)
1.3.5

구글 드라이브 연동

  • 구글 드라이브 → colab notebook → 새 폴더 생성 : data → 슬랙에서 다운 받은 lemonade.csv 파일을 올린다 -> 다음 코드를 실행
1
2
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive

Mounted at ..drive 가 출력되었으므로 성공

현재 좌측에 폴더 그림 -> drive -> mydrive -> Colab Notebooks -> data -> supermarket_sales.csv를 찾아서 우클릭 -> 경로 복사 -> 다음 코드에 붙여넣어 사용

1
2
3
DATA_PATH = '/content/drive/MyDrive/Colab Notebooks/data/supermarket_sales.csv'
sales = pd.read_csv(DATA_PATH)
sales

Invoice ID Branch City Customer type Gender Product line Unit price Quantity Date Time Payment
0 750-67-8428 A Yangon Member Female Health and beauty 74.69 7 1/5/2019 13:08 Ewallet
1 226-31-3081 C Naypyitaw Normal Female Electronic accessories 15.28 5 3/8/2019 10:29 Cash
2 631-41-3108 A Yangon Normal Male Home and lifestyle 46.33 7 3/3/2019 13:23 Credit card
3 123-19-1176 A Yangon Member Male Health and beauty 58.22 8 1/27/2019 20:33 Ewallet
4 373-73-7910 A Yangon Normal Male Sports and travel 86.31 7 2/8/2019 10:37 Ewallet
... ... ... ... ... ... ... ... ... ... ... ...
995 233-67-5758 C Naypyitaw Normal Male Health and beauty 40.35 1 1/29/2019 13:46 Ewallet
996 303-96-2227 B Mandalay Normal Female Home and lifestyle 97.38 10 3/2/2019 17:16 Ewallet
997 727-02-1313 A Yangon Member Male Food and beverages 31.84 1 2/9/2019 13:22 Cash
998 347-56-2442 A Yangon Normal Male Home and lifestyle 65.82 1 2/22/2019 15:33 Cash
999 849-09-3807 A Yangon Member Female Fashion accessories 88.34 7 2/18/2019 13:28 Cash

1000 rows × 11 columns

  <script>
    const buttonEl =
      document.querySelector('#df-8a1e46d8-83ea-49d2-a98d-cf274f10b34d button.colab-df-convert');
    buttonEl.style.display =
      google.colab.kernel.accessAllowed ? 'block' : 'none';

    async function convertToInteractive(key) {
      const element = document.querySelector('#df-8a1e46d8-83ea-49d2-a98d-cf274f10b34d');
      const dataTable =
        await google.colab.kernel.invokeFunction('convertToInteractive',
                                                 [key], {});
      if (!dataTable) return;

      const docLinkHtml = 'Like what you see? Visit the ' +
        '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
        + ' to learn more about interactive tables.';
      element.innerHTML = '';
      dataTable['output_type'] = 'display_data';
      await google.colab.output.renderOutput(dataTable, element);
      const docLink = document.createElement('div');
      docLink.innerHTML = docLinkHtml;
      element.appendChild(docLink);
    }
  </script>
</div>
1
sales.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Invoice ID     1000 non-null   object 
 1   Branch         1000 non-null   object 
 2   City           1000 non-null   object 
 3   Customer type  1000 non-null   object 
 4   Gender         1000 non-null   object 
 5   Product line   1000 non-null   object 
 6   Unit price     1000 non-null   float64
 7   Quantity       1000 non-null   int64  
 8   Date           1000 non-null   object 
 9   Time           1000 non-null   object 
 10  Payment        1000 non-null   object 
dtypes: float64(1), int64(1), object(9)
memory usage: 86.1+ KB

Group by

  • (동의어) 집계함수를 배운다.
1
2
# 여러가지 시도해보면서 정보를 파악해보자
sales['Invoice ID'].value_counts()
750-67-8428    1
642-61-4706    1
816-72-8853    1
491-38-3499    1
322-02-2271    1
              ..
633-09-3463    1
374-17-3652    1
378-07-7001    1
433-75-6987    1
849-09-3807    1
Name: Invoice ID, Length: 1000, dtype: int64
1
2
# 여러가지 시도해보면서 정보를 파악해보자
sales.groupby('Customer type')['Quantity'].sum()
Customer type
Member    2785
Normal    2725
Name: Quantity, dtype: int64
1
sales.groupby(['Customer type', 'Branch', 'Payment'])['Quantity'].sum()
Customer type  Branch  Payment    
Member         A       Cash           308
                       Credit card    282
                       Ewallet        374
               B       Cash           284
                       Credit card    371
                       Ewallet        269
               C       Cash           293
                       Credit card    349
                       Ewallet        255
Normal         A       Cash           264
                       Credit card    298
                       Ewallet        333
               B       Cash           344
                       Credit card    228
                       Ewallet        324
               C       Cash           403
                       Credit card    194
                       Ewallet        337
Name: Quantity, dtype: int64
  • data type은 Series 이다.
1
print(type(sales.groupby(['Customer type', 'Branch', 'Payment'])['Quantity'].sum()))
<class 'pandas.core.series.Series'>
  • 검색 키워드를 잘 선택하는게 중요하다.
1
sales.groupby(['Customer type', 'Branch', 'Payment'])['Quantity'].agg(['sum', 'mean'])

sum mean
Customer type Branch Payment
Member A Cash 308 5.500000
Credit card 282 5.755102
Ewallet 374 6.032258
B Cash 284 5.358491
Credit card 371 5.888889
Ewallet 269 5.489796
C Cash 293 4.966102
Credit card 349 5.816667
Ewallet 255 5.100000
Normal A Cash 264 4.888889
Credit card 298 5.418182
Ewallet 333 5.203125
B Cash 344 6.035088
Credit card 228 4.956522
Ewallet 324 5.062500
C Cash 403 6.200000
Credit card 194 5.105263
Ewallet 337 6.017857

  <script>
    const buttonEl =
      document.querySelector('#df-9f19e00c-ea81-404c-b289-c9ddb325aeaa button.colab-df-convert');
    buttonEl.style.display =
      google.colab.kernel.accessAllowed ? 'block' : 'none';

    async function convertToInteractive(key) {
      const element = document.querySelector('#df-9f19e00c-ea81-404c-b289-c9ddb325aeaa');
      const dataTable =
        await google.colab.kernel.invokeFunction('convertToInteractive',
                                                 [key], {});
      if (!dataTable) return;

      const docLinkHtml = 'Like what you see? Visit the ' +
        '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
        + ' to learn more about interactive tables.';
      element.innerHTML = '';
      dataTable['output_type'] = 'display_data';
      await google.colab.output.renderOutput(dataTable, element);
      const docLink = document.createElement('div');
      docLink.innerHTML = docLinkHtml;
      element.appendChild(docLink);
    }
  </script>
</div>
1
print(type(sales.groupby(['Customer type', 'Branch', 'Payment'])['Quantity'].agg(['sum', 'mean'])))
<class 'pandas.core.frame.DataFrame'>
1
sales.groupby(['Customer type', 'Branch', 'Payment'], as_index=False)['Quantity'].agg(['sum', 'mean'])

sum mean
Customer type Branch Payment
Member A Cash 308 5.500000
Credit card 282 5.755102
Ewallet 374 6.032258
B Cash 284 5.358491
Credit card 371 5.888889
Ewallet 269 5.489796
C Cash 293 4.966102
Credit card 349 5.816667
Ewallet 255 5.100000
Normal A Cash 264 4.888889
Credit card 298 5.418182
Ewallet 333 5.203125
B Cash 344 6.035088
Credit card 228 4.956522
Ewallet 324 5.062500
C Cash 403 6.200000
Credit card 194 5.105263
Ewallet 337 6.017857

  <script>
    const buttonEl =
      document.querySelector('#df-f56f9b0d-43e2-4ba0-8abf-56fa96c5d20f button.colab-df-convert');
    buttonEl.style.display =
      google.colab.kernel.accessAllowed ? 'block' : 'none';

    async function convertToInteractive(key) {
      const element = document.querySelector('#df-f56f9b0d-43e2-4ba0-8abf-56fa96c5d20f');
      const dataTable =
        await google.colab.kernel.invokeFunction('convertToInteractive',
                                                 [key], {});
      if (!dataTable) return;

      const docLinkHtml = 'Like what you see? Visit the ' +
        '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
        + ' to learn more about interactive tables.';
      element.innerHTML = '';
      dataTable['output_type'] = 'display_data';
      await google.colab.output.renderOutput(dataTable, element);
      const docLink = document.createElement('div');
      docLink.innerHTML = docLinkHtml;
      element.appendChild(docLink);
    }
  </script>
</div>

결측치 다루기

  • 결측치 데이터 생성
    • 임의로 여러가지 생성해보자 (숙달 과정)
1
2
3
4
5
6
7
8
9
10
11
import pandas as pd
import numpy as np

dict_01 = {
'Score_A' : [80, 90, np.nan, 80],
'Score_B' : [30, 45, np.nan, np.nan],
'Score_C' : [np.nan, 50, 80, 90],
}

df = pd.DataFrame(dict_01)
df

Score_A Score_B Score_C
0 80.0 30.0 NaN
1 90.0 45.0 50.0
2 NaN NaN 80.0
3 80.0 NaN 90.0

  <script>
    const buttonEl =
      document.querySelector('#df-6c9ddc3e-23cb-46c2-bcca-8e0adaab788b button.colab-df-convert');
    buttonEl.style.display =
      google.colab.kernel.accessAllowed ? 'block' : 'none';

    async function convertToInteractive(key) {
      const element = document.querySelector('#df-6c9ddc3e-23cb-46c2-bcca-8e0adaab788b');
      const dataTable =
        await google.colab.kernel.invokeFunction('convertToInteractive',
                                                 [key], {});
      if (!dataTable) return;

      const docLinkHtml = 'Like what you see? Visit the ' +
        '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
        + ' to learn more about interactive tables.';
      element.innerHTML = '';
      dataTable['output_type'] = 'display_data';
      await google.colab.output.renderOutput(dataTable, element);
      const docLink = document.createElement('div');
      docLink.innerHTML = docLinkHtml;
      element.appendChild(docLink);
    }
  </script>
</div>
  • True = 숫자 1로 인식

  • False = 숫자 0으로 인식

  • 결측치 (Nan) 개수 세기

1
df.isnull().sum()
Score_A    1
Score_B    2
Score_C    1
dtype: int64
  • 결측치를 다른 것으로 채우기
1
df.fillna("0")

Score_A Score_B Score_C
0 80.0 30.0 0
1 90.0 45.0 50.0
2 0 0 80.0
3 80.0 0 90.0

  <script>
    const buttonEl =
      document.querySelector('#df-ce166771-c2da-430d-aa22-8b7cac811d94 button.colab-df-convert');
    buttonEl.style.display =
      google.colab.kernel.accessAllowed ? 'block' : 'none';

    async function convertToInteractive(key) {
      const element = document.querySelector('#df-ce166771-c2da-430d-aa22-8b7cac811d94');
      const dataTable =
        await google.colab.kernel.invokeFunction('convertToInteractive',
                                                 [key], {});
      if (!dataTable) return;

      const docLinkHtml = 'Like what you see? Visit the ' +
        '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
        + ' to learn more about interactive tables.';
      element.innerHTML = '';
      dataTable['output_type'] = 'display_data';
      await google.colab.output.renderOutput(dataTable, element);
      const docLink = document.createElement('div');
      docLink.innerHTML = docLinkHtml;
      element.appendChild(docLink);
    }
  </script>
</div>
1
2
# 바로 윗칸의 데이터로 채우기
df.fillna(method="pad")

Score_A Score_B Score_C
0 80.0 30.0 NaN
1 90.0 45.0 50.0
2 90.0 45.0 80.0
3 80.0 45.0 90.0

  <script>
    const buttonEl =
      document.querySelector('#df-14c34a17-7745-4466-a779-62f00b5030de button.colab-df-convert');
    buttonEl.style.display =
      google.colab.kernel.accessAllowed ? 'block' : 'none';

    async function convertToInteractive(key) {
      const element = document.querySelector('#df-14c34a17-7745-4466-a779-62f00b5030de');
      const dataTable =
        await google.colab.kernel.invokeFunction('convertToInteractive',
                                                 [key], {});
      if (!dataTable) return;

      const docLinkHtml = 'Like what you see? Visit the ' +
        '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
        + ' to learn more about interactive tables.';
      element.innerHTML = '';
      dataTable['output_type'] = 'display_data';
      await google.colab.output.renderOutput(dataTable, element);
      const docLink = document.createElement('div');
      docLink.innerHTML = docLinkHtml;
      element.appendChild(docLink);
    }
  </script>
</div>
1
2
3
4
5
6
7
dict_01 = {
"성별" : ["남자", "여자", np.nan, "남자"],
"Salary" : [30, 45, 90, 70],
}

df = pd.DataFrame(dict_01)
df

성별 Salary
0 남자 30
1 여자 45
2 NaN 90
3 남자 70

  <script>
    const buttonEl =
      document.querySelector('#df-54d9e838-5824-4cb9-9f0f-8291411d9270 button.colab-df-convert');
    buttonEl.style.display =
      google.colab.kernel.accessAllowed ? 'block' : 'none';

    async function convertToInteractive(key) {
      const element = document.querySelector('#df-54d9e838-5824-4cb9-9f0f-8291411d9270');
      const dataTable =
        await google.colab.kernel.invokeFunction('convertToInteractive',
                                                 [key], {});
      if (!dataTable) return;

      const docLinkHtml = 'Like what you see? Visit the ' +
        '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
        + ' to learn more about interactive tables.';
      element.innerHTML = '';
      dataTable['output_type'] = 'display_data';
      await google.colab.output.renderOutput(dataTable, element);
      const docLink = document.createElement('div');
      docLink.innerHTML = docLinkHtml;
      element.appendChild(docLink);
    }
  </script>
</div>
1
df['성별'].fillna("성별 없음")
0       남자
1       여자
2    성별 없음
3       남자
Name: 성별, dtype: object
  • 결측치

–> 문자열 타입이랑 / 숫자 타입이랑 접근 방법이 다름
–> 문자열(빈도 –> 가장 많이 나타나는 문자열 넣어주기!, 최빈값)
–> 숫자열(평균, 최대, 최소, 중간, 기타 등등..)

1
2
3
4
5
6
7
8
9
10
11
12
import pandas as pd
import numpy as np

dict_01 = {
'Score_A' : [80, 90, np.nan, 80],
'Score_B' : [30, 45, np.nan, np.nan],
'Score_C' : [np.nan, 50, 80, 90],
'Score_D' : [50, 30, 80, 60],
}

df = pd.DataFrame(dict_01)
df

Score_A Score_B Score_C Score_D
0 80.0 30.0 NaN 50
1 90.0 45.0 50.0 30
2 NaN NaN 80.0 80
3 80.0 NaN 90.0 60

  <script>
    const buttonEl =
      document.querySelector('#df-a1c2a0ad-c902-4c13-8d35-ae4931ac7c3d button.colab-df-convert');
    buttonEl.style.display =
      google.colab.kernel.accessAllowed ? 'block' : 'none';

    async function convertToInteractive(key) {
      const element = document.querySelector('#df-a1c2a0ad-c902-4c13-8d35-ae4931ac7c3d');
      const dataTable =
        await google.colab.kernel.invokeFunction('convertToInteractive',
                                                 [key], {});
      if (!dataTable) return;

      const docLinkHtml = 'Like what you see? Visit the ' +
        '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
        + ' to learn more about interactive tables.';
      element.innerHTML = '';
      dataTable['output_type'] = 'display_data';
      await google.colab.output.renderOutput(dataTable, element);
      const docLink = document.createElement('div');
      docLink.innerHTML = docLinkHtml;
      element.appendChild(docLink);
    }
  </script>
</div>
  • 결측치가 있을 때 열을 지운다.
    • axis = 1 -> columns
1
df.dropna(axis = 1)

Score_D
0 50
1 30
2 80
3 60

  <script>
    const buttonEl =
      document.querySelector('#df-a9658aae-24a1-43bd-a6fe-73b30c752e90 button.colab-df-convert');
    buttonEl.style.display =
      google.colab.kernel.accessAllowed ? 'block' : 'none';

    async function convertToInteractive(key) {
      const element = document.querySelector('#df-a9658aae-24a1-43bd-a6fe-73b30c752e90');
      const dataTable =
        await google.colab.kernel.invokeFunction('convertToInteractive',
                                                 [key], {});
      if (!dataTable) return;

      const docLinkHtml = 'Like what you see? Visit the ' +
        '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
        + ' to learn more about interactive tables.';
      element.innerHTML = '';
      dataTable['output_type'] = 'display_data';
      await google.colab.output.renderOutput(dataTable, element);
      const docLink = document.createElement('div');
      docLink.innerHTML = docLinkHtml;
      element.appendChild(docLink);
    }
  </script>
</div>
  • 결측치가 있을 때 행을 지운다.
    • axis = 0 -> index
1
df.dropna(axis = 0)

Score_A Score_B Score_C Score_D
1 90.0 45.0 50.0 30

  <script>
    const buttonEl =
      document.querySelector('#df-1359a925-a759-4d30-9b57-10abcaf3af1a button.colab-df-convert');
    buttonEl.style.display =
      google.colab.kernel.accessAllowed ? 'block' : 'none';

    async function convertToInteractive(key) {
      const element = document.querySelector('#df-1359a925-a759-4d30-9b57-10abcaf3af1a');
      const dataTable =
        await google.colab.kernel.invokeFunction('convertToInteractive',
                                                 [key], {});
      if (!dataTable) return;

      const docLinkHtml = 'Like what you see? Visit the ' +
        '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
        + ' to learn more about interactive tables.';
      element.innerHTML = '';
      dataTable['output_type'] = 'display_data';
      await google.colab.output.renderOutput(dataTable, element);
      const docLink = document.createElement('div');
      docLink.innerHTML = docLinkHtml;
      element.appendChild(docLink);
    }
  </script>
</div>

이상치

1
sales

Invoice ID Branch City Customer type Gender Product line Unit price Quantity Date Time Payment
0 750-67-8428 A Yangon Member Female Health and beauty 74.69 7 1/5/2019 13:08 Ewallet
1 226-31-3081 C Naypyitaw Normal Female Electronic accessories 15.28 5 3/8/2019 10:29 Cash
2 631-41-3108 A Yangon Normal Male Home and lifestyle 46.33 7 3/3/2019 13:23 Credit card
3 123-19-1176 A Yangon Member Male Health and beauty 58.22 8 1/27/2019 20:33 Ewallet
4 373-73-7910 A Yangon Normal Male Sports and travel 86.31 7 2/8/2019 10:37 Ewallet
... ... ... ... ... ... ... ... ... ... ... ...
995 233-67-5758 C Naypyitaw Normal Male Health and beauty 40.35 1 1/29/2019 13:46 Ewallet
996 303-96-2227 B Mandalay Normal Female Home and lifestyle 97.38 10 3/2/2019 17:16 Ewallet
997 727-02-1313 A Yangon Member Male Food and beverages 31.84 1 2/9/2019 13:22 Cash
998 347-56-2442 A Yangon Normal Male Home and lifestyle 65.82 1 2/22/2019 15:33 Cash
999 849-09-3807 A Yangon Member Female Fashion accessories 88.34 7 2/18/2019 13:28 Cash

1000 rows × 11 columns

  <script>
    const buttonEl =
      document.querySelector('#df-04da7866-a736-4456-8d77-a7760df771c5 button.colab-df-convert');
    buttonEl.style.display =
      google.colab.kernel.accessAllowed ? 'block' : 'none';

    async function convertToInteractive(key) {
      const element = document.querySelector('#df-04da7866-a736-4456-8d77-a7760df771c5');
      const dataTable =
        await google.colab.kernel.invokeFunction('convertToInteractive',
                                                 [key], {});
      if (!dataTable) return;

      const docLinkHtml = 'Like what you see? Visit the ' +
        '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
        + ' to learn more about interactive tables.';
      element.innerHTML = '';
      dataTable['output_type'] = 'display_data';
      await google.colab.output.renderOutput(dataTable, element);
      const docLink = document.createElement('div');
      docLink.innerHTML = docLinkHtml;
      element.appendChild(docLink);
    }
  </script>
</div>
  • 일반적인 통계적 공식

  • IQR - 박스플롯 - 사분위수

  • Q0(0), Q1(25%), Q2(50%), Q3(75%), Q4(100%)

  • 이상치의 하한 경계값 : Q1 - 1.5 * (Q3-Q1)

  • 이상치의 상한 경계값 : Q3 + 1.5 * (Q3-Q1)

  • 도메인 (각 비즈니스 영역, 미래 일자리) 에서 바라보는 이상치 기준 (관습)

1
sales[['Unit price']]. describe()

Unit price
count 1000.000000
mean 55.672130
std 26.494628
min 10.080000
25% 32.875000
50% 55.230000
75% 77.935000
max 99.960000

  <script>
    const buttonEl =
      document.querySelector('#df-6bdae016-4a2d-4f55-9e51-5f68e6af7217 button.colab-df-convert');
    buttonEl.style.display =
      google.colab.kernel.accessAllowed ? 'block' : 'none';

    async function convertToInteractive(key) {
      const element = document.querySelector('#df-6bdae016-4a2d-4f55-9e51-5f68e6af7217');
      const dataTable =
        await google.colab.kernel.invokeFunction('convertToInteractive',
                                                 [key], {});
      if (!dataTable) return;

      const docLinkHtml = 'Like what you see? Visit the ' +
        '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
        + ' to learn more about interactive tables.';
      element.innerHTML = '';
      dataTable['output_type'] = 'display_data';
      await google.colab.output.renderOutput(dataTable, element);
      const docLink = document.createElement('div');
      docLink.innerHTML = docLinkHtml;
      element.appendChild(docLink);
    }
  </script>
</div>
  • 이상치의 하한 경계값 : Q1 - 1.5 * (Q3-Q1)
  • 이런 공식은 통계적으로 타당하지만 그 외에도 이상치인지 판단할 방법이 있다.
1
2
3
4
5
6
7
8
Q1 = sales['Unit price'].quantile(0.25)
Q3 = sales['Unit price'].quantile(0.75)

# Q1보다 낮은 값을 이상치로 간주
outliers_q1 = (sales['Unit price'] < Q1)

# Q1보다 높은 값을 이상치로 간주
outliers_q3 = (sales['Unit price'] > Q3)
  • 이 코드는 특히 중요하다
1
print(sales['Unit price'][~(outliers_q1 | outliers_q3)])
0      74.69
2      46.33
3      58.22
6      68.84
7      73.56
       ...  
991    76.60
992    58.03
994    60.95
995    40.35
998    65.82
Name: Unit price, Length: 500, dtype: float64

pandas_10minutes

Pandas 10분 완성

https://dataitgirls2.github.io/10minutes2pandas/

1
2
3
4
# 라이브러리 불러오기
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

1.Object Creation (객체 생성)

  • Pandas는 값을 가지고 있는 리스트를 통해 Series를 만들고, 정수로 만들어진 인덱스를 기본값으로 불러온다.
1
2
3
# Series를 이용한 객체 생성
s = pd.Series([1,3,5,np.nan,6,8])
s
0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64
  • datetime 인덱스와 레이블이 있는 열을 가지고 있는 numpy 배열을 전달하여 데이터프레임을 만든다.
1
2
3
# date_range()를 이용해 20130101을 포함한 연속적인 6일의 데이터를 넣는다.
dates = pd.date_range('20130101', periods = 6)
dates
DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')
1
2
3
# 데이터 프레임 생성
df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))
df

A B C D
2013-01-01 -0.214371 -0.489334 0.807876 -2.328570
2013-01-02 -0.018762 -0.438046 0.593880 0.671849
2013-01-03 -0.596207 0.081615 0.182117 -2.063007
2013-01-04 -2.044753 -0.853425 1.582471 -0.756233
2013-01-05 0.394973 -0.526762 0.393856 1.550660
2013-01-06 -1.665879 0.184903 1.905710 2.345500

  <script>
    const buttonEl =
      document.querySelector('#df-98ec8384-9a3f-4ee3-9d62-d8f6d1821857 button.colab-df-convert');
    buttonEl.style.display =
      google.colab.kernel.accessAllowed ? 'block' : 'none';

    async function convertToInteractive(key) {
      const element = document.querySelector('#df-98ec8384-9a3f-4ee3-9d62-d8f6d1821857');
      const dataTable =
        await google.colab.kernel.invokeFunction('convertToInteractive',
                                                 [key], {});
      if (!dataTable) return;

      const docLinkHtml = 'Like what you see? Visit the ' +
        '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
        + ' to learn more about interactive tables.';
      element.innerHTML = '';
      dataTable['output_type'] = 'display_data';
      await google.colab.output.renderOutput(dataTable, element);
      const docLink = document.createElement('div');
      docLink.innerHTML = docLinkHtml;
      element.appendChild(docLink);
    }
  </script>
</div>
  • Series와 같은 것으로 변환될 수 있는 객체들의 dict로 구성된 데이터프레임을 만든다.
1
2
3
4
5
6
7
8
df2 = pd.DataFrame({'A' : 1.,
'B' : pd.Timestamp('20130102'),
'C' : pd.Series(1, index=list(range(4)), dtype='float32'),
'D' : np.array([3] * 4,dtype = 'int32'),
'E' : pd.Categorical(["test", "train", "test", "train"]),
'F' : 'foo'})

df2

A B C D E F
0 1.0 2013-01-02 1.0 3 test foo
1 1.0 2013-01-02 1.0 3 train foo
2 1.0 2013-01-02 1.0 3 test foo
3 1.0 2013-01-02 1.0 3 train foo

  <script>
    const buttonEl =
      document.querySelector('#df-32a9a2b4-301b-48af-8afa-569444b4838a button.colab-df-convert');
    buttonEl.style.display =
      google.colab.kernel.accessAllowed ? 'block' : 'none';

    async function convertToInteractive(key) {
      const element = document.querySelector('#df-32a9a2b4-301b-48af-8afa-569444b4838a');
      const dataTable =
        await google.colab.kernel.invokeFunction('convertToInteractive',
                                                 [key], {});
      if (!dataTable) return;

      const docLinkHtml = 'Like what you see? Visit the ' +
        '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
        + ' to learn more about interactive tables.';
      element.innerHTML = '';
      dataTable['output_type'] = 'display_data';
      await google.colab.output.renderOutput(dataTable, element);
      const docLink = document.createElement('div');
      docLink.innerHTML = docLinkHtml;
      element.appendChild(docLink);
    }
  </script>
</div>
  • 데이터프레임 결과물의 열은 다양한 데이터 타입 (dtypes) 으로 구성
1
df2.dtypes
A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

2.Viewing Data (데이터 확인하기)

  • 데이터프레임의 가장 윗 줄과 마지막 줄을 확인하고 싶을 때에 사용하는 방법은 다음과 같다.
1
2
df.tail(3)  # 끝에서부터 3줄을 출력
df.tail() # 끝에서부터 5줄을 출력. 디폴트값이 5이다

A B C D
2013-01-02 -0.018762 -0.438046 0.593880 0.671849
2013-01-03 -0.596207 0.081615 0.182117 -2.063007
2013-01-04 -2.044753 -0.853425 1.582471 -0.756233
2013-01-05 0.394973 -0.526762 0.393856 1.550660
2013-01-06 -1.665879 0.184903 1.905710 2.345500

  <script>
    const buttonEl =
      document.querySelector('#df-b90adfec-6c56-41b4-a15c-699d26a51033 button.colab-df-convert');
    buttonEl.style.display =
      google.colab.kernel.accessAllowed ? 'block' : 'none';

    async function convertToInteractive(key) {
      const element = document.querySelector('#df-b90adfec-6c56-41b4-a15c-699d26a51033');
      const dataTable =
        await google.colab.kernel.invokeFunction('convertToInteractive',
                                                 [key], {});
      if (!dataTable) return;

      const docLinkHtml = 'Like what you see? Visit the ' +
        '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
        + ' to learn more about interactive tables.';
      element.innerHTML = '';
      dataTable['output_type'] = 'display_data';
      await google.colab.output.renderOutput(dataTable, element);
      const docLink = document.createElement('div');
      docLink.innerHTML = docLinkHtml;
      element.appendChild(docLink);
    }
  </script>
</div>
1
df.head() # 처음 5줄을 출력.

A B C D
2013-01-01 -0.214371 -0.489334 0.807876 -2.328570
2013-01-02 -0.018762 -0.438046 0.593880 0.671849
2013-01-03 -0.596207 0.081615 0.182117 -2.063007
2013-01-04 -2.044753 -0.853425 1.582471 -0.756233
2013-01-05 0.394973 -0.526762 0.393856 1.550660

  <script>
    const buttonEl =
      document.querySelector('#df-1d5aaccf-3123-46cc-a71a-b7f235656a2f button.colab-df-convert');
    buttonEl.style.display =
      google.colab.kernel.accessAllowed ? 'block' : 'none';

    async function convertToInteractive(key) {
      const element = document.querySelector('#df-1d5aaccf-3123-46cc-a71a-b7f235656a2f');
      const dataTable =
        await google.colab.kernel.invokeFunction('convertToInteractive',
                                                 [key], {});
      if (!dataTable) return;

      const docLinkHtml = 'Like what you see? Visit the ' +
        '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
        + ' to learn more about interactive tables.';
      element.innerHTML = '';
      dataTable['output_type'] = 'display_data';
      await google.colab.output.renderOutput(dataTable, element);
      const docLink = document.createElement('div');
      docLink.innerHTML = docLinkHtml;
      element.appendChild(docLink);
    }
  </script>
</div>
  • 인덱스(index), 열(column) 그리고 numpy 데이터에 대한 세부 정보를 표시
1
df.index
DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')
1
df.columns
Index(['A', 'B', 'C', 'D'], dtype='object')
1
df.values
array([[-0.21437119, -0.48933404,  0.80787593, -2.32856993],
       [-0.01876194, -0.43804563,  0.59387975,  0.67184854],
       [-0.59620717,  0.08161493,  0.18211706, -2.06300731],
       [-2.0447528 , -0.85342539,  1.58247067, -0.75623263],
       [ 0.39497306, -0.52676189,  0.39385602,  1.55066002],
       [-1.66587853,  0.18490331,  1.9057098 ,  2.34549952]])
  • describe()는 데이터의 대략적인 통계적 정보 요약을 보여준다.
1
df.describe()

A B C D
count 6.000000 6.000000 6.000000 6.000000
mean -0.690833 -0.340175 0.910985 -0.096634
std 0.964410 0.395899 0.685599 1.926208
min -2.044753 -0.853425 0.182117 -2.328570
25% -1.398461 -0.517405 0.443862 -1.736314
50% -0.405289 -0.463690 0.700878 -0.042192
75% -0.067664 -0.048300 1.388822 1.330957
max 0.394973 0.184903 1.905710 2.345500

  <script>
    const buttonEl =
      document.querySelector('#df-bc50d2c1-8824-4181-b8ce-c7aecf4a30e1 button.colab-df-convert');
    buttonEl.style.display =
      google.colab.kernel.accessAllowed ? 'block' : 'none';

    async function convertToInteractive(key) {
      const element = document.querySelector('#df-bc50d2c1-8824-4181-b8ce-c7aecf4a30e1');
      const dataTable =
        await google.colab.kernel.invokeFunction('convertToInteractive',
                                                 [key], {});
      if (!dataTable) return;

      const docLinkHtml = 'Like what you see? Visit the ' +
        '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
        + ' to learn more about interactive tables.';
      element.innerHTML = '';
      dataTable['output_type'] = 'display_data';
      await google.colab.output.renderOutput(dataTable, element);
      const docLink = document.createElement('div');
      docLink.innerHTML = docLinkHtml;
      element.appendChild(docLink);
    }
  </script>
</div>
  • 데이터를 전치한다.
  • 즉, 두 축을 서로 바꾼다
1
df.T

2013-01-01 2013-01-02 2013-01-03 2013-01-04 2013-01-05 2013-01-06
A -0.214371 -0.018762 -0.596207 -2.044753 0.394973 -1.665879
B -0.489334 -0.438046 0.081615 -0.853425 -0.526762 0.184903
C 0.807876 0.593880 0.182117 1.582471 0.393856 1.905710
D -2.328570 0.671849 -2.063007 -0.756233 1.550660 2.345500

  <script>
    const buttonEl =
      document.querySelector('#df-cfc48a53-c108-4b41-bf62-f85fc6923d20 button.colab-df-convert');
    buttonEl.style.display =
      google.colab.kernel.accessAllowed ? 'block' : 'none';

    async function convertToInteractive(key) {
      const element = document.querySelector('#df-cfc48a53-c108-4b41-bf62-f85fc6923d20');
      const dataTable =
        await google.colab.kernel.invokeFunction('convertToInteractive',
                                                 [key], {});
      if (!dataTable) return;

      const docLinkHtml = 'Like what you see? Visit the ' +
        '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
        + ' to learn more about interactive tables.';
      element.innerHTML = '';
      dataTable['output_type'] = 'display_data';
      await google.colab.output.renderOutput(dataTable, element);
      const docLink = document.createElement('div');
      docLink.innerHTML = docLinkHtml;
      element.appendChild(docLink);
    }
  </script>
</div>
  • 축 별로 정렬한다.
1
2
df.sort_index(axis=1, ascending=False)
# axis=1 일때, 열방향으로 실행한다.

D C B A
2013-01-01 -2.328570 0.807876 -0.489334 -0.214371
2013-01-02 0.671849 0.593880 -0.438046 -0.018762
2013-01-03 -2.063007 0.182117 0.081615 -0.596207
2013-01-04 -0.756233 1.582471 -0.853425 -2.044753
2013-01-05 1.550660 0.393856 -0.526762 0.394973
2013-01-06 2.345500 1.905710 0.184903 -1.665879

  <script>
    const buttonEl =
      document.querySelector('#df-2b34e281-4cce-48fe-b140-c57dc7828ca4 button.colab-df-convert');
    buttonEl.style.display =
      google.colab.kernel.accessAllowed ? 'block' : 'none';

    async function convertToInteractive(key) {
      const element = document.querySelector('#df-2b34e281-4cce-48fe-b140-c57dc7828ca4');
      const dataTable =
        await google.colab.kernel.invokeFunction('convertToInteractive',
                                                 [key], {});
      if (!dataTable) return;

      const docLinkHtml = 'Like what you see? Visit the ' +
        '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
        + ' to learn more about interactive tables.';
      element.innerHTML = '';
      dataTable['output_type'] = 'display_data';
      await google.colab.output.renderOutput(dataTable, element);
      const docLink = document.createElement('div');
      docLink.innerHTML = docLinkHtml;
      element.appendChild(docLink);
    }
  </script>
</div>
  • 값 별로 정렬한다.
1
df.sort_values(by='B')

A B C D
2013-01-04 -2.044753 -0.853425 1.582471 -0.756233
2013-01-05 0.394973 -0.526762 0.393856 1.550660
2013-01-01 -0.214371 -0.489334 0.807876 -2.328570
2013-01-02 -0.018762 -0.438046 0.593880 0.671849
2013-01-03 -0.596207 0.081615 0.182117 -2.063007
2013-01-06 -1.665879 0.184903 1.905710 2.345500

  <script>
    const buttonEl =
      document.querySelector('#df-30e7da7b-4a36-43a6-b2c3-a5a382f252de button.colab-df-convert');
    buttonEl.style.display =
      google.colab.kernel.accessAllowed ? 'block' : 'none';

    async function convertToInteractive(key) {
      const element = document.querySelector('#df-30e7da7b-4a36-43a6-b2c3-a5a382f252de');
      const dataTable =
        await google.colab.kernel.invokeFunction('convertToInteractive',
                                                 [key], {});
      if (!dataTable) return;

      const docLinkHtml = 'Like what you see? Visit the ' +
        '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
        + ' to learn more about interactive tables.';
      element.innerHTML = '';
      dataTable['output_type'] = 'display_data';
      await google.colab.output.renderOutput(dataTable, element);
      const docLink = document.createElement('div');
      docLink.innerHTML = docLinkHtml;
      element.appendChild(docLink);
    }
  </script>
</div>

3.Selection (선택)

  • 주석 (Note) : 선택과 설정을 위한 Python / Numpy의 표준화된 표현들이 직관적이며, 코드 작성을 위한 양방향 작업에 유용하지만 우리는 Pandas에 최적화된 데이터 접근 방법인 .at, .iat, .loc 및 .iloc 을 추천.

  • Getting (데이터 얻기)

    • df.A 와 동일한 Series를 생성하는 단일 열을 선택
1
df['A']
2013-01-01   -0.214371
2013-01-02   -0.018762
2013-01-03   -0.596207
2013-01-04   -2.044753
2013-01-05    0.394973
2013-01-06   -1.665879
Freq: D, Name: A, dtype: float64
  • 행을 분할하는 [ ]를 통해 선택한다.
1
df[0:3]

A B C D
2013-01-01 0.654753 -0.366034 -1.440226 -1.043957
2013-01-02 1.589167 0.321939 1.393342 0.898153
2013-01-03 0.270879 0.107423 -2.032053 1.861947

  <script>
    const buttonEl =
      document.querySelector('#df-42a08968-1f35-45a3-bca9-318f73f513c6 button.colab-df-convert');
    buttonEl.style.display =
      google.colab.kernel.accessAllowed ? 'block' : 'none';

    async function convertToInteractive(key) {
      const element = document.querySelector('#df-42a08968-1f35-45a3-bca9-318f73f513c6');
      const dataTable =
        await google.colab.kernel.invokeFunction('convertToInteractive',
                                                 [key], {});
      if (!dataTable) return;

      const docLinkHtml = 'Like what you see? Visit the ' +
        '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
        + ' to learn more about interactive tables.';
      element.innerHTML = '';
      dataTable['output_type'] = 'display_data';
      await google.colab.output.renderOutput(dataTable, element);
      const docLink = document.createElement('div');
      docLink.innerHTML = docLinkHtml;
      element.appendChild(docLink);
    }
  </script>
</div>
1
df['20130102':'20130104']

A B C D
2013-01-02 -0.018762 -0.438046 0.593880 0.671849
2013-01-03 -0.596207 0.081615 0.182117 -2.063007
2013-01-04 -2.044753 -0.853425 1.582471 -0.756233

  <script>
    const buttonEl =
      document.querySelector('#df-ebb43e9b-1f8f-4438-93a1-e004bc46108e button.colab-df-convert');
    buttonEl.style.display =
      google.colab.kernel.accessAllowed ? 'block' : 'none';

    async function convertToInteractive(key) {
      const element = document.querySelector('#df-ebb43e9b-1f8f-4438-93a1-e004bc46108e');
      const dataTable =
        await google.colab.kernel.invokeFunction('convertToInteractive',
                                                 [key], {});
      if (!dataTable) return;

      const docLinkHtml = 'Like what you see? Visit the ' +
        '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
        + ' to learn more about interactive tables.';
      element.innerHTML = '';
      dataTable['output_type'] = 'display_data';
      await google.colab.output.renderOutput(dataTable, element);
      const docLink = document.createElement('div');
      docLink.innerHTML = docLinkHtml;
      element.appendChild(docLink);
    }
  </script>
</div>

Selection by Label (Label을 통한 선택)

  • 라벨을 사용하여 횡단면을 얻는다.
1
df.loc[dates[0]]
A   -0.214371
B   -0.489334
C    0.807876
D   -2.328570
Name: 2013-01-01 00:00:00, dtype: float64
  • 라벨을 사용하여 여러 축의 데이터를 획득한다.
1
df.loc[:,['A','B']]

A B
2013-01-01 -0.214371 -0.489334
2013-01-02 -0.018762 -0.438046
2013-01-03 -0.596207 0.081615
2013-01-04 -2.044753 -0.853425
2013-01-05 0.394973 -0.526762
2013-01-06 -1.665879 0.184903

  <script>
    const buttonEl =
      document.querySelector('#df-0d130634-6f70-4a3b-a931-3907cbf47eb6 button.colab-df-convert');
    buttonEl.style.display =
      google.colab.kernel.accessAllowed ? 'block' : 'none';

    async function convertToInteractive(key) {
      const element = document.querySelector('#df-0d130634-6f70-4a3b-a931-3907cbf47eb6');
      const dataTable =
        await google.colab.kernel.invokeFunction('convertToInteractive',
                                                 [key], {});
      if (!dataTable) return;

      const docLinkHtml = 'Like what you see? Visit the ' +
        '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
        + ' to learn more about interactive tables.';
      element.innerHTML = '';
      dataTable['output_type'] = 'display_data';
      await google.colab.output.renderOutput(dataTable, element);
      const docLink = document.createElement('div');
      docLink.innerHTML = docLinkHtml;
      element.appendChild(docLink);
    }
  </script>
</div>
  • 양쪽 종단점을 포함한 라벨 슬라이싱을 표시.
1
df.loc['20130102':'20130104', ['A','B']]

A B
2013-01-02 -0.018762 -0.438046
2013-01-03 -0.596207 0.081615
2013-01-04 -2.044753 -0.853425

  <script>
    const buttonEl =
      document.querySelector('#df-80998ed9-78c8-48ef-84e6-b2e58d3420c3 button.colab-df-convert');
    buttonEl.style.display =
      google.colab.kernel.accessAllowed ? 'block' : 'none';

    async function convertToInteractive(key) {
      const element = document.querySelector('#df-80998ed9-78c8-48ef-84e6-b2e58d3420c3');
      const dataTable =
        await google.colab.kernel.invokeFunction('convertToInteractive',
                                                 [key], {});
      if (!dataTable) return;

      const docLinkHtml = 'Like what you see? Visit the ' +
        '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
        + ' to learn more about interactive tables.';
      element.innerHTML = '';
      dataTable['output_type'] = 'display_data';
      await google.colab.output.renderOutput(dataTable, element);
      const docLink = document.createElement('div');
      docLink.innerHTML = docLinkHtml;
      element.appendChild(docLink);
    }
  </script>
</div>
  • 반환되는 객체의 차원를 줄인다.
1
df.loc['20130102',['A','B']]
A   -0.018762
B   -0.438046
Name: 2013-01-02 00:00:00, dtype: float64
  • 스칼라 값을 얻는다.
1
df.loc[dates[0], 'A']
-0.21437119207750993
  • 스칼라 값을 더 빠르게 구하는 방법 (앞선 메소드와 동일하다)
1
df.at[dates[0], 'A']
-0.21437119207750993

Selection by Position(위치로 선택하기)

  • 넘겨받은 정수의 위치를 기준으로 선택.
1
df.iloc[3]
A   -2.044753
B   -0.853425
C    1.582471
D   -0.756233
Name: 2013-01-04 00:00:00, dtype: float64
  • 정수로 표기된 슬라이스들을 통해, numpy / python과 유사하게 작동.
1
df.iloc[3:5, 0:2]

A B
2013-01-04 -2.044753 -0.853425
2013-01-05 0.394973 -0.526762

  <script>
    const buttonEl =
      document.querySelector('#df-a6c9e7cd-47c2-42bc-8726-b9b8dbf72bb9 button.colab-df-convert');
    buttonEl.style.display =
      google.colab.kernel.accessAllowed ? 'block' : 'none';

    async function convertToInteractive(key) {
      const element = document.querySelector('#df-a6c9e7cd-47c2-42bc-8726-b9b8dbf72bb9');
      const dataTable =
        await google.colab.kernel.invokeFunction('convertToInteractive',
                                                 [key], {});
      if (!dataTable) return;

      const docLinkHtml = 'Like what you see? Visit the ' +
        '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
        + ' to learn more about interactive tables.';
      element.innerHTML = '';
      dataTable['output_type'] = 'display_data';
      await google.colab.output.renderOutput(dataTable, element);
      const docLink = document.createElement('div');
      docLink.innerHTML = docLinkHtml;
      element.appendChild(docLink);
    }
  </script>
</div>
  • 정수로 표기된 위치값의 리스트를 통하여 numpy / python 의 스타일과 유사해진다.
1
df.iloc[[1, 2, 4], [0, 2]]

A C
2013-01-02 -0.018762 0.593880
2013-01-03 -0.596207 0.182117
2013-01-05 0.394973 0.393856

  <script>
    const buttonEl =
      document.querySelector('#df-1d8279d9-f642-48e7-8d6a-5006bade95b3 button.colab-df-convert');
    buttonEl.style.display =
      google.colab.kernel.accessAllowed ? 'block' : 'none';

    async function convertToInteractive(key) {
      const element = document.querySelector('#df-1d8279d9-f642-48e7-8d6a-5006bade95b3');
      const dataTable =
        await google.colab.kernel.invokeFunction('convertToInteractive',
                                                 [key], {});
      if (!dataTable) return;

      const docLinkHtml = 'Like what you see? Visit the ' +
        '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
        + ' to learn more about interactive tables.';
      element.innerHTML = '';
      dataTable['output_type'] = 'display_data';
      await google.colab.output.renderOutput(dataTable, element);
      const docLink = document.createElement('div');
      docLink.innerHTML = docLinkHtml;
      element.appendChild(docLink);
    }
  </script>
</div>
  • 다음은 명시적으로 행을 나누고자 하는 경우이다
    • 즉, 한쪽을 공백으로 둘 경우
1
df.iloc[1:3, : ]

A B C D
2013-01-02 -0.018762 -0.438046 0.593880 0.671849
2013-01-03 -0.596207 0.081615 0.182117 -2.063007

  <script>
    const buttonEl =
      document.querySelector('#df-f8cec5e2-bbc6-4151-9e24-a55e5e4cd2b8 button.colab-df-convert');
    buttonEl.style.display =
      google.colab.kernel.accessAllowed ? 'block' : 'none';

    async function convertToInteractive(key) {
      const element = document.querySelector('#df-f8cec5e2-bbc6-4151-9e24-a55e5e4cd2b8');
      const dataTable =
        await google.colab.kernel.invokeFunction('convertToInteractive',
                                                 [key], {});
      if (!dataTable) return;

      const docLinkHtml = 'Like what you see? Visit the ' +
        '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
        + ' to learn more about interactive tables.';
      element.innerHTML = '';
      dataTable['output_type'] = 'display_data';
      await google.colab.output.renderOutput(dataTable, element);
      const docLink = document.createElement('div');
      docLink.innerHTML = docLinkHtml;
      element.appendChild(docLink);
    }
  </script>
</div>
  • 다음은 명시적으로 열을 나누고자 하는 경우이다
    • 즉, 한쪽을 공백으로 둘 경우
1
df.iloc[ : , 1:3]

B C
2013-01-01 -0.489334 0.807876
2013-01-02 -0.438046 0.593880
2013-01-03 0.081615 0.182117
2013-01-04 -0.853425 1.582471
2013-01-05 -0.526762 0.393856
2013-01-06 0.184903 1.905710

  <script>
    const buttonEl =
      document.querySelector('#df-e2e93a02-91f2-43f7-a503-bec4339b83b8 button.colab-df-convert');
    buttonEl.style.display =
      google.colab.kernel.accessAllowed ? 'block' : 'none';

    async function convertToInteractive(key) {
      const element = document.querySelector('#df-e2e93a02-91f2-43f7-a503-bec4339b83b8');
      const dataTable =
        await google.colab.kernel.invokeFunction('convertToInteractive',
                                                 [key], {});
      if (!dataTable) return;

      const docLinkHtml = 'Like what you see? Visit the ' +
        '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
        + ' to learn more about interactive tables.';
      element.innerHTML = '';
      dataTable['output_type'] = 'display_data';
      await google.colab.output.renderOutput(dataTable, element);
      const docLink = document.createElement('div');
      docLink.innerHTML = docLinkHtml;
      element.appendChild(docLink);
    }
  </script>
</div>
  • 다음은 명시적으로 (특정한) 값을 얻고자 하는 경우이다.
1
df.iloc[1, 1]
-0.43804562902186034
  • 스칼라 값을 빠르게 얻는 방법 (위의 방식과 동일하다)
1
df.iat[1,1]
-0.43804562902186034

Boolean Indexing

  • 데이터를 선택하기 위해 단일 열의 값을 사용
1
df[df.A > 0]

A B C D
2013-01-05 0.394973 -0.526762 0.393856 1.55066

  <script>
    const buttonEl =
      document.querySelector('#df-4f4aed32-1929-4051-a6c9-8f593fb92c84 button.colab-df-convert');
    buttonEl.style.display =
      google.colab.kernel.accessAllowed ? 'block' : 'none';

    async function convertToInteractive(key) {
      const element = document.querySelector('#df-4f4aed32-1929-4051-a6c9-8f593fb92c84');
      const dataTable =
        await google.colab.kernel.invokeFunction('convertToInteractive',
                                                 [key], {});
      if (!dataTable) return;

      const docLinkHtml = 'Like what you see? Visit the ' +
        '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
        + ' to learn more about interactive tables.';
      element.innerHTML = '';
      dataTable['output_type'] = 'display_data';
      await google.colab.output.renderOutput(dataTable, element);
      const docLink = document.createElement('div');
      docLink.innerHTML = docLinkHtml;
      element.appendChild(docLink);
    }
  </script>
</div>
  • Boolean 조건을 충족하는 데이터프레임에서 값을 선택
1
df[df > 0]

A B C D
2013-01-01 NaN NaN 0.807876 NaN
2013-01-02 NaN NaN 0.593880 0.671849
2013-01-03 NaN 0.081615 0.182117 NaN
2013-01-04 NaN NaN 1.582471 NaN
2013-01-05 0.394973 NaN 0.393856 1.550660
2013-01-06 NaN 0.184903 1.905710 2.345500

  <script>
    const buttonEl =
      document.querySelector('#df-35a1e446-5c1c-4a6c-b39d-6ff2b7801761 button.colab-df-convert');
    buttonEl.style.display =
      google.colab.kernel.accessAllowed ? 'block' : 'none';

    async function convertToInteractive(key) {
      const element = document.querySelector('#df-35a1e446-5c1c-4a6c-b39d-6ff2b7801761');
      const dataTable =
        await google.colab.kernel.invokeFunction('convertToInteractive',
                                                 [key], {});
      if (!dataTable) return;

      const docLinkHtml = 'Like what you see? Visit the ' +
        '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
        + ' to learn more about interactive tables.';
      element.innerHTML = '';
      dataTable['output_type'] = 'display_data';
      await google.colab.output.renderOutput(dataTable, element);
      const docLink = document.createElement('div');
      docLink.innerHTML = docLinkHtml;
      element.appendChild(docLink);
    }
  </script>
</div>

isin

  • 필터링을 위한 메소드이다.
1
2
3
df2 = df.copy()
df2 = ['one', 'one', 'two', 'three', 'four', 'three']
df2
1
# df2[df2['E'].isin(['two','four'])]

Setting (설정)

  • 새 열을 설정하면 데이터가 인덱스 별로 자동 정렬된다.
1
2
s1 = pd.Series([1,2,3,4,5,6], index=pd.date_range('20130102', periods=6))
s1
2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64
1
df['F'] = s1
  • 라벨에 의해 값을 설정한다.
1
df.at[dates[0], 'A'] = 0
  • 위치에 의해 값을 설정한다.
1
df.iat[0, 1] = 0
  • Numpy 배열을 사용한 할당에 의해 값을 설정한다.
1
df.loc[:, 'D'] = np.array([5] * len(df))
  • 위 설정대로 작동한 결과다.
1
df

A B C D F
2013-01-01 0.000000 0.000000 0.807876 5 NaN
2013-01-02 -0.018762 -0.438046 0.593880 5 1.0
2013-01-03 -0.596207 0.081615 0.182117 5 2.0
2013-01-04 -2.044753 -0.853425 1.582471 5 3.0
2013-01-05 0.394973 -0.526762 0.393856 5 4.0
2013-01-06 -1.665879 0.184903 1.905710 5 5.0

  <script>
    const buttonEl =
      document.querySelector('#df-12bdced2-022e-4cec-87a4-12165a361bd7 button.colab-df-convert');
    buttonEl.style.display =
      google.colab.kernel.accessAllowed ? 'block' : 'none';

    async function convertToInteractive(key) {
      const element = document.querySelector('#df-12bdced2-022e-4cec-87a4-12165a361bd7');
      const dataTable =
        await google.colab.kernel.invokeFunction('convertToInteractive',
                                                 [key], {});
      if (!dataTable) return;

      const docLinkHtml = 'Like what you see? Visit the ' +
        '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
        + ' to learn more about interactive tables.';
      element.innerHTML = '';
      dataTable['output_type'] = 'display_data';
      await google.colab.output.renderOutput(dataTable, element);
      const docLink = document.createElement('div');
      docLink.innerHTML = docLinkHtml;
      element.appendChild(docLink);
    }
  </script>
</div>
  • where 연산을 설정합니다.
1
2
3
df2 = df.copy()
df2[df2 > 0] = -df2
df2

A B C D F
2013-01-01 0.000000 0.000000 -0.807876 -5 NaN
2013-01-02 -0.018762 -0.438046 -0.593880 -5 -1.0
2013-01-03 -0.596207 -0.081615 -0.182117 -5 -2.0
2013-01-04 -2.044753 -0.853425 -1.582471 -5 -3.0
2013-01-05 -0.394973 -0.526762 -0.393856 -5 -4.0
2013-01-06 -1.665879 -0.184903 -1.905710 -5 -5.0

  <script>
    const buttonEl =
      document.querySelector('#df-2e3a7cd9-4958-46ba-a9ca-9fb4031816c0 button.colab-df-convert');
    buttonEl.style.display =
      google.colab.kernel.accessAllowed ? 'block' : 'none';

    async function convertToInteractive(key) {
      const element = document.querySelector('#df-2e3a7cd9-4958-46ba-a9ca-9fb4031816c0');
      const dataTable =
        await google.colab.kernel.invokeFunction('convertToInteractive',
                                                 [key], {});
      if (!dataTable) return;

      const docLinkHtml = 'Like what you see? Visit the ' +
        '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
        + ' to learn more about interactive tables.';
      element.innerHTML = '';
      dataTable['output_type'] = 'display_data';
      await google.colab.output.renderOutput(dataTable, element);
      const docLink = document.createElement('div');
      docLink.innerHTML = docLinkHtml;
      element.appendChild(docLink);
    }
  </script>
</div>

4.Missing Data (결측치)

  • Pandas는 결측치를 표현하기 위해 주로 np.nan 값을 사용한다.
    • 이 방법은 기본 설정값이지만 계산에는 포함되지 않는다.
  • Reindexing으로 지정된 축 상의 인덱스를 변경 / 추가 / 삭제 가능. Reindexing은 데이터의 복사본을 반환.
1
2
3
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
df1.loc[dates[0]:dates[1], 'E'] = 1
df1

A B C D F E
2013-01-01 0.000000 0.000000 0.807876 5 NaN 1.0
2013-01-02 -0.018762 -0.438046 0.593880 5 1.0 1.0
2013-01-03 -0.596207 0.081615 0.182117 5 2.0 NaN
2013-01-04 -2.044753 -0.853425 1.582471 5 3.0 NaN

  <script>
    const buttonEl =
      document.querySelector('#df-bd5a580c-316a-44a4-b5af-496ed1ad2f48 button.colab-df-convert');
    buttonEl.style.display =
      google.colab.kernel.accessAllowed ? 'block' : 'none';

    async function convertToInteractive(key) {
      const element = document.querySelector('#df-bd5a580c-316a-44a4-b5af-496ed1ad2f48');
      const dataTable =
        await google.colab.kernel.invokeFunction('convertToInteractive',
                                                 [key], {});
      if (!dataTable) return;

      const docLinkHtml = 'Like what you see? Visit the ' +
        '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
        + ' to learn more about interactive tables.';
      element.innerHTML = '';
      dataTable['output_type'] = 'display_data';
      await google.colab.output.renderOutput(dataTable, element);
      const docLink = document.createElement('div');
      docLink.innerHTML = docLinkHtml;
      element.appendChild(docLink);
    }
  </script>
</div>
  • 결측치를 가지고 있는 행들을 지운다.
1
df1.dropna(how = 'any')

A B C D F E
2013-01-02 -0.018762 -0.438046 0.59388 5 1.0 1.0

  <script>
    const buttonEl =
      document.querySelector('#df-e2491041-b1d4-4902-9dce-0ef94792d204 button.colab-df-convert');
    buttonEl.style.display =
      google.colab.kernel.accessAllowed ? 'block' : 'none';

    async function convertToInteractive(key) {
      const element = document.querySelector('#df-e2491041-b1d4-4902-9dce-0ef94792d204');
      const dataTable =
        await google.colab.kernel.invokeFunction('convertToInteractive',
                                                 [key], {});
      if (!dataTable) return;

      const docLinkHtml = 'Like what you see? Visit the ' +
        '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
        + ' to learn more about interactive tables.';
      element.innerHTML = '';
      dataTable['output_type'] = 'display_data';
      await google.colab.output.renderOutput(dataTable, element);
      const docLink = document.createElement('div');
      docLink.innerHTML = docLinkHtml;
      element.appendChild(docLink);
    }
  </script>
</div>
  • 결측치를 채워 넣는다.
1
df1.fillna(value=5)

A B C D F E
2013-01-01 0.000000 0.000000 0.807876 5 5.0 1.0
2013-01-02 -0.018762 -0.438046 0.593880 5 1.0 1.0
2013-01-03 -0.596207 0.081615 0.182117 5 2.0 5.0
2013-01-04 -2.044753 -0.853425 1.582471 5 3.0 5.0

  <script>
    const buttonEl =
      document.querySelector('#df-d9ef038f-e4ba-4702-b903-2a4056ec371e button.colab-df-convert');
    buttonEl.style.display =
      google.colab.kernel.accessAllowed ? 'block' : 'none';

    async function convertToInteractive(key) {
      const element = document.querySelector('#df-d9ef038f-e4ba-4702-b903-2a4056ec371e');
      const dataTable =
        await google.colab.kernel.invokeFunction('convertToInteractive',
                                                 [key], {});
      if (!dataTable) return;

      const docLinkHtml = 'Like what you see? Visit the ' +
        '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
        + ' to learn more about interactive tables.';
      element.innerHTML = '';
      dataTable['output_type'] = 'display_data';
      await google.colab.output.renderOutput(dataTable, element);
      const docLink = document.createElement('div');
      docLink.innerHTML = docLinkHtml;
      element.appendChild(docLink);
    }
  </script>
</div>
  • nan인 값에 boolean을 통한 표식을 얻는다.
  • 데이터프레임의 모든 값이 boolean 형태로 표시되며, nan 값에만 True를 표시한다.
1
pd.isna(df1)

A B C D F E
2013-01-01 False False False False True False
2013-01-02 False False False False False False
2013-01-03 False False False False False True
2013-01-04 False False False False False True

  <script>
    const buttonEl =
      document.querySelector('#df-5ac60ecc-0098-4e10-8e23-74a0aa20e121 button.colab-df-convert');
    buttonEl.style.display =
      google.colab.kernel.accessAllowed ? 'block' : 'none';

    async function convertToInteractive(key) {
      const element = document.querySelector('#df-5ac60ecc-0098-4e10-8e23-74a0aa20e121');
      const dataTable =
        await google.colab.kernel.invokeFunction('convertToInteractive',
                                                 [key], {});
      if (!dataTable) return;

      const docLinkHtml = 'Like what you see? Visit the ' +
        '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
        + ' to learn more about interactive tables.';
      element.innerHTML = '';
      dataTable['output_type'] = 'display_data';
      await google.colab.output.renderOutput(dataTable, element);
      const docLink = document.createElement('div');
      docLink.innerHTML = docLinkHtml;
      element.appendChild(docLink);
    }
  </script>
</div>

5.Operation (연산)

Stats (통계)

  • 일반적으로 결측치를 제외한 후 연산.
  • 기술통계를 수행한다.
1
df.mean()
A   -0.655105
B   -0.258619
C    0.910985
D    5.000000
F    3.000000
dtype: float64
  • 다른 축에서도 동일한 연산을 수행.
1
df.mean(1)
2013-01-01    1.451969
2013-01-02    1.227414
2013-01-03    1.333505
2013-01-04    1.336858
2013-01-05    1.852413
2013-01-06    2.084947
Freq: D, dtype: float64
  • 정렬이 필요하다. 차원이 다른 객체로 연산해보자.
  • pandas는 지정된 차원을 따라 자동으로 브로드 캐스팅된다.
    • broadcast란 n차원이나 스칼라 값으로 연산을 수행할 때 도출되는 결과의 규칙을 설명하는 것을 의미
1
2
s = pd.Series([1, 3, 4, np.nan, 6, 8], index=dates)
s
2013-01-01    1.0
2013-01-02    3.0
2013-01-03    4.0
2013-01-04    NaN
2013-01-05    6.0
2013-01-06    8.0
Freq: D, dtype: float64
  • 위 코드를 shift로 2칸 옮긴 것
1
2
s = pd.Series([1, 3, 4, np.nan, 6, 8], index=dates).shift(2)
s
2013-01-01    NaN
2013-01-02    NaN
2013-01-03    1.0
2013-01-04    3.0
2013-01-05    4.0
2013-01-06    NaN
Freq: D, dtype: float64
  • index를 축(axis)으로 실행
1
df.sub(s, axis='index')   

A B C D F
2013-01-01 NaN NaN NaN NaN NaN
2013-01-02 NaN NaN NaN NaN NaN
2013-01-03 -1.596207 -0.918385 -0.817883 4.0 1.0
2013-01-04 -5.044753 -3.853425 -1.417529 2.0 0.0
2013-01-05 -3.605027 -4.526762 -3.606144 1.0 0.0
2013-01-06 NaN NaN NaN NaN NaN

  <script>
    const buttonEl =
      document.querySelector('#df-99b442a2-cd95-4ff7-886d-18b60cfb80eb button.colab-df-convert');
    buttonEl.style.display =
      google.colab.kernel.accessAllowed ? 'block' : 'none';

    async function convertToInteractive(key) {
      const element = document.querySelector('#df-99b442a2-cd95-4ff7-886d-18b60cfb80eb');
      const dataTable =
        await google.colab.kernel.invokeFunction('convertToInteractive',
                                                 [key], {});
      if (!dataTable) return;

      const docLinkHtml = 'Like what you see? Visit the ' +
        '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
        + ' to learn more about interactive tables.';
      element.innerHTML = '';
      dataTable['output_type'] = 'display_data';
      await google.colab.output.renderOutput(dataTable, element);
      const docLink = document.createElement('div');
      docLink.innerHTML = docLinkHtml;
      element.appendChild(docLink);
    }
  </script>
</div>

Apply (적용)

  • 데이터에 함수를 적용한다.
1
df.apply(np.cumsum)

A B C D F
2013-01-01 0.000000 0.000000 0.807876 5 NaN
2013-01-02 -0.018762 -0.438046 1.401756 10 1.0
2013-01-03 -0.614969 -0.356431 1.583873 15 3.0
2013-01-04 -2.659722 -1.209856 3.166343 20 6.0
2013-01-05 -2.264749 -1.736618 3.560199 25 10.0
2013-01-06 -3.930627 -1.551715 5.465909 30 15.0

  <script>
    const buttonEl =
      document.querySelector('#df-5f71bb31-515e-4d3e-b14d-f49a6239f43b button.colab-df-convert');
    buttonEl.style.display =
      google.colab.kernel.accessAllowed ? 'block' : 'none';

    async function convertToInteractive(key) {
      const element = document.querySelector('#df-5f71bb31-515e-4d3e-b14d-f49a6239f43b');
      const dataTable =
        await google.colab.kernel.invokeFunction('convertToInteractive',
                                                 [key], {});
      if (!dataTable) return;

      const docLinkHtml = 'Like what you see? Visit the ' +
        '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
        + ' to learn more about interactive tables.';
      element.innerHTML = '';
      dataTable['output_type'] = 'display_data';
      await google.colab.output.renderOutput(dataTable, element);
      const docLink = document.createElement('div');
      docLink.innerHTML = docLinkHtml;
      element.appendChild(docLink);
    }
  </script>
</div>
1
df.apply(lambda x: x.max() - x.min())
A    2.439726
B    1.038329
C    1.723593
D    0.000000
F    4.000000
dtype: float64

Histogramming (히스토그래밍)

1
2
s = pd.Series(np.random.randint(0, 7, size=10))   # 랜덤 생성
s
0    1
1    5
2    0
3    1
4    6
5    6
6    5
7    5
8    5
9    1
dtype: int64
1
s.value_counts()
5    4
1    3
6    2
0    1
dtype: int64

String Methods (문자열 메소드)

  • Series는 다음의 코드와 같이 문자열 처리 메소드 모음 (set)을 가지고 있다.
  • 이 모음은 배열의 각 요소를 쉽게 조작할 수 있도록 만들어주는 문자열의 속성에 포함되어 있다.
  • 문자열의 패턴 일치 확인은 기본적으로 정규 표현식을 사용.
1
2
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
s.str.lower()
0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object

6.Merge (병합)

concat (연결)

  • 결합 (join) / 병합 (merge) 형태의 연산에 대한 인덱스, 관계 대수 기능을 위한 다양한 형태의 논리를 포함한 Series, 데이터프레임, Panel 객체를 손쉽게 결합할 수 있도록 하는 다양한 기능을 pandas 에서 제공한다.

  • concat()으로 pandas 객체를 연결한다.

1
2
df = pd.DataFrame(np.random.randn(10, 4))
df

0 1 2 3
0 -0.639128 -0.371715 -2.320589 0.932025
1 -1.041656 0.646479 1.551379 -0.353387
2 -0.782444 0.677232 -0.050054 -0.054370
3 -0.418260 0.673768 -1.694420 0.193668
4 0.788359 -0.308937 -0.314680 -0.058661
5 0.457466 -2.021977 0.611340 -0.538168
6 1.355963 1.295236 -0.399497 -0.052334
7 -0.324138 -0.165932 0.290442 0.531520
8 -0.386876 0.217569 0.926404 -0.813724
9 -0.452338 -0.259533 -0.810046 1.186298

  <script>
    const buttonEl =
      document.querySelector('#df-64ce2183-4d4b-4538-857b-4b2f38748c8b button.colab-df-convert');
    buttonEl.style.display =
      google.colab.kernel.accessAllowed ? 'block' : 'none';

    async function convertToInteractive(key) {
      const element = document.querySelector('#df-64ce2183-4d4b-4538-857b-4b2f38748c8b');
      const dataTable =
        await google.colab.kernel.invokeFunction('convertToInteractive',
                                                 [key], {});
      if (!dataTable) return;

      const docLinkHtml = 'Like what you see? Visit the ' +
        '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
        + ' to learn more about interactive tables.';
      element.innerHTML = '';
      dataTable['output_type'] = 'display_data';
      await google.colab.output.renderOutput(dataTable, element);
      const docLink = document.createElement('div');
      docLink.innerHTML = docLinkHtml;
      element.appendChild(docLink);
    }
  </script>
</div>
1
2
3
4
# break it into pieces
# 조각내고 concat을 통해 다시 연결한다.
pieces = [df[ : 3], df[3 : 7], df[7 : ]]
pd.concat(pieces)

0 1 2 3
0 -0.639128 -0.371715 -2.320589 0.932025
1 -1.041656 0.646479 1.551379 -0.353387
2 -0.782444 0.677232 -0.050054 -0.054370
3 -0.418260 0.673768 -1.694420 0.193668
4 0.788359 -0.308937 -0.314680 -0.058661
5 0.457466 -2.021977 0.611340 -0.538168
6 1.355963 1.295236 -0.399497 -0.052334
7 -0.324138 -0.165932 0.290442 0.531520
8 -0.386876 0.217569 0.926404 -0.813724
9 -0.452338 -0.259533 -0.810046 1.186298

  <script>
    const buttonEl =
      document.querySelector('#df-b9f8c250-4e6a-49c3-8b57-729bd58c514f button.colab-df-convert');
    buttonEl.style.display =
      google.colab.kernel.accessAllowed ? 'block' : 'none';

    async function convertToInteractive(key) {
      const element = document.querySelector('#df-b9f8c250-4e6a-49c3-8b57-729bd58c514f');
      const dataTable =
        await google.colab.kernel.invokeFunction('convertToInteractive',
                                                 [key], {});
      if (!dataTable) return;

      const docLinkHtml = 'Like what you see? Visit the ' +
        '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
        + ' to learn more about interactive tables.';
      element.innerHTML = '';
      dataTable['output_type'] = 'display_data';
      await google.colab.output.renderOutput(dataTable, element);
      const docLink = document.createElement('div');
      docLink.innerHTML = docLinkHtml;
      element.appendChild(docLink);
    }
  </script>
</div>

Join (결합)

SQL 방식으로 병합한다.

1
2
3
left = pd.DataFrame({'key' : ['foo', 'foo'], 'lval' : [1, 2]})
right = pd.DataFrame({'key': ['foo', 'foo'], 'rval': [4, 5]})
left

key lval
0 foo 1
1 foo 2

  <script>
    const buttonEl =
      document.querySelector('#df-5ee0033f-5dc0-4ee1-a4ae-aab4984c9f56 button.colab-df-convert');
    buttonEl.style.display =
      google.colab.kernel.accessAllowed ? 'block' : 'none';

    async function convertToInteractive(key) {
      const element = document.querySelector('#df-5ee0033f-5dc0-4ee1-a4ae-aab4984c9f56');
      const dataTable =
        await google.colab.kernel.invokeFunction('convertToInteractive',
                                                 [key], {});
      if (!dataTable) return;

      const docLinkHtml = 'Like what you see? Visit the ' +
        '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
        + ' to learn more about interactive tables.';
      element.innerHTML = '';
      dataTable['output_type'] = 'display_data';
      await google.colab.output.renderOutput(dataTable, element);
      const docLink = document.createElement('div');
      docLink.innerHTML = docLinkHtml;
      element.appendChild(docLink);
    }
  </script>
</div>
1
right

key rval
0 foo 4
1 foo 5

  <script>
    const buttonEl =
      document.querySelector('#df-7b2e03cd-9c2d-471a-aba2-b4a36968ffa5 button.colab-df-convert');
    buttonEl.style.display =
      google.colab.kernel.accessAllowed ? 'block' : 'none';

    async function convertToInteractive(key) {
      const element = document.querySelector('#df-7b2e03cd-9c2d-471a-aba2-b4a36968ffa5');
      const dataTable =
        await google.colab.kernel.invokeFunction('convertToInteractive',
                                                 [key], {});
      if (!dataTable) return;

      const docLinkHtml = 'Like what you see? Visit the ' +
        '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
        + ' to learn more about interactive tables.';
      element.innerHTML = '';
      dataTable['output_type'] = 'display_data';
      await google.colab.output.renderOutput(dataTable, element);
      const docLink = document.createElement('div');
      docLink.innerHTML = docLinkHtml;
      element.appendChild(docLink);
    }
  </script>
</div>
  • 위 두 가지를 병합하기
1
pd.merge(left, right, on = 'key')

key lval rval
0 foo 1 4
1 foo 1 5
2 foo 2 4
3 foo 2 5

  <script>
    const buttonEl =
      document.querySelector('#df-62da57e4-df6e-4fe9-b5a0-3094a8ba0ed2 button.colab-df-convert');
    buttonEl.style.display =
      google.colab.kernel.accessAllowed ? 'block' : 'none';

    async function convertToInteractive(key) {
      const element = document.querySelector('#df-62da57e4-df6e-4fe9-b5a0-3094a8ba0ed2');
      const dataTable =
        await google.colab.kernel.invokeFunction('convertToInteractive',
                                                 [key], {});
      if (!dataTable) return;

      const docLinkHtml = 'Like what you see? Visit the ' +
        '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
        + ' to learn more about interactive tables.';
      element.innerHTML = '';
      dataTable['output_type'] = 'display_data';
      await google.colab.output.renderOutput(dataTable, element);
      const docLink = document.createElement('div');
      docLink.innerHTML = docLinkHtml;
      element.appendChild(docLink);
    }
  </script>
</div>

Append (추가)

  • 데이터프레임에 행을 추가한다.
1
2
df = pd.DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D'])
df

A B C D
0 -0.909696 0.146335 -0.568276 -0.434510
1 -0.802681 0.235747 -0.751299 -0.053560
2 2.005541 -1.265754 -1.152046 -0.081151
3 -0.422940 -0.095189 -1.634583 0.180732
4 -1.535375 -0.594391 -1.102247 0.047852
5 0.369960 -0.902356 -1.196501 -0.109521
6 -1.369044 -2.044557 -0.487275 0.267463
7 0.439153 0.003023 -1.716505 -2.119485

  <script>
    const buttonEl =
      document.querySelector('#df-fb76804e-7023-4762-a17b-9f4e79e4d070 button.colab-df-convert');
    buttonEl.style.display =
      google.colab.kernel.accessAllowed ? 'block' : 'none';

    async function convertToInteractive(key) {
      const element = document.querySelector('#df-fb76804e-7023-4762-a17b-9f4e79e4d070');
      const dataTable =
        await google.colab.kernel.invokeFunction('convertToInteractive',
                                                 [key], {});
      if (!dataTable) return;

      const docLinkHtml = 'Like what you see? Visit the ' +
        '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
        + ' to learn more about interactive tables.';
      element.innerHTML = '';
      dataTable['output_type'] = 'display_data';
      await google.colab.output.renderOutput(dataTable, element);
      const docLink = document.createElement('div');
      docLink.innerHTML = docLinkHtml;
      element.appendChild(docLink);
    }
  </script>
</div>
  • 3행의 내용을 복사하여 8행을 추가한다.
1
2
s = df.iloc[3]
df.append(s, ignore_index = True)

A B C D
0 -0.909696 0.146335 -0.568276 -0.434510
1 -0.802681 0.235747 -0.751299 -0.053560
2 2.005541 -1.265754 -1.152046 -0.081151
3 -0.422940 -0.095189 -1.634583 0.180732
4 -1.535375 -0.594391 -1.102247 0.047852
5 0.369960 -0.902356 -1.196501 -0.109521
6 -1.369044 -2.044557 -0.487275 0.267463
7 0.439153 0.003023 -1.716505 -2.119485
8 -0.422940 -0.095189 -1.634583 0.180732

  <script>
    const buttonEl =
      document.querySelector('#df-9c5c9e98-661f-4d00-b16c-b0e389424ef0 button.colab-df-convert');
    buttonEl.style.display =
      google.colab.kernel.accessAllowed ? 'block' : 'none';

    async function convertToInteractive(key) {
      const element = document.querySelector('#df-9c5c9e98-661f-4d00-b16c-b0e389424ef0');
      const dataTable =
        await google.colab.kernel.invokeFunction('convertToInteractive',
                                                 [key], {});
      if (!dataTable) return;

      const docLinkHtml = 'Like what you see? Visit the ' +
        '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
        + ' to learn more about interactive tables.';
      element.innerHTML = '';
      dataTable['output_type'] = 'display_data';
      await google.colab.output.renderOutput(dataTable, element);
      const docLink = document.createElement('div');
      docLink.innerHTML = docLinkHtml;
      element.appendChild(docLink);
    }
  </script>
</div>

7.Grouping (그룹화)

  • 룹화는 다음 단계 중 하나 이상을 포함하는 과정을 가리킨다.
    • 몇몇 기준에 따라 여러 그룹으로 데이터를 분할 (splitting)
    • 각 그룹에 독립적으로 함수를 적용 (applying)
    • 결과물들을 하나의 데이터 구조로 결합 (combining)
1
2
3
4
5
6
7
8
9
df = pd.DataFrame(
{
'A' : ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
'B' : ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
'C' : np.random.randn(8),
'D' : np.random.randn(8)
})

df

A B C D
0 foo one 0.144726 0.653074
1 bar one -0.590503 0.128616
2 foo two 1.816665 -1.533646
3 bar three -1.574489 -0.140956
4 foo two 0.103910 1.448011
5 bar two -0.610817 0.742873
6 foo one -1.576850 0.444138
7 foo three 0.857080 0.157513

  <script>
    const buttonEl =
      document.querySelector('#df-eb85beec-778a-44c7-aac7-eb71caf64586 button.colab-df-convert');
    buttonEl.style.display =
      google.colab.kernel.accessAllowed ? 'block' : 'none';

    async function convertToInteractive(key) {
      const element = document.querySelector('#df-eb85beec-778a-44c7-aac7-eb71caf64586');
      const dataTable =
        await google.colab.kernel.invokeFunction('convertToInteractive',
                                                 [key], {});
      if (!dataTable) return;

      const docLinkHtml = 'Like what you see? Visit the ' +
        '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
        + ' to learn more about interactive tables.';
      element.innerHTML = '';
      dataTable['output_type'] = 'display_data';
      await google.colab.output.renderOutput(dataTable, element);
      const docLink = document.createElement('div');
      docLink.innerHTML = docLinkHtml;
      element.appendChild(docLink);
    }
  </script>
</div>
  • 생성된 데이터프레임을 그룹화한 후 각 그룹에 sum() 함수를 적용.
1
df.groupby('A').sum()

C D
A
bar -2.775808 0.730534
foo 1.345531 1.169089

  <script>
    const buttonEl =
      document.querySelector('#df-1aa22f34-79db-46e3-ae05-1187c99d6af5 button.colab-df-convert');
    buttonEl.style.display =
      google.colab.kernel.accessAllowed ? 'block' : 'none';

    async function convertToInteractive(key) {
      const element = document.querySelector('#df-1aa22f34-79db-46e3-ae05-1187c99d6af5');
      const dataTable =
        await google.colab.kernel.invokeFunction('convertToInteractive',
                                                 [key], {});
      if (!dataTable) return;

      const docLinkHtml = 'Like what you see? Visit the ' +
        '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
        + ' to learn more about interactive tables.';
      element.innerHTML = '';
      dataTable['output_type'] = 'display_data';
      await google.colab.output.renderOutput(dataTable, element);
      const docLink = document.createElement('div');
      docLink.innerHTML = docLinkHtml;
      element.appendChild(docLink);
    }
  </script>
</div>
  • 여러 열을 기준으로 그룹화하면 계층적 인덱스가 형성된다. 여기에도 sum 함수를 적용 가능.
1
df.groupby(['A', 'B']).sum()

C D
A B
bar one -0.590503 0.128616
three -1.574489 -0.140956
two -0.610817 0.742873
foo one -1.432124 1.097212
three 0.857080 0.157513
two 1.920575 -0.085635

  <script>
    const buttonEl =
      document.querySelector('#df-4d41ac7d-2fef-4b0e-a140-e235f321dab0 button.colab-df-convert');
    buttonEl.style.display =
      google.colab.kernel.accessAllowed ? 'block' : 'none';

    async function convertToInteractive(key) {
      const element = document.querySelector('#df-4d41ac7d-2fef-4b0e-a140-e235f321dab0');
      const dataTable =
        await google.colab.kernel.invokeFunction('convertToInteractive',
                                                 [key], {});
      if (!dataTable) return;

      const docLinkHtml = 'Like what you see? Visit the ' +
        '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
        + ' to learn more about interactive tables.';
      element.innerHTML = '';
      dataTable['output_type'] = 'display_data';
      await google.colab.output.renderOutput(dataTable, element);
      const docLink = document.createElement('div');
      docLink.innerHTML = docLinkHtml;
      element.appendChild(docLink);
    }
  </script>
</div>

8.Reshaping (변형)

Stack (스택)

1
2
3
4
5
6
7
8
9
tuples = list(zip(*[['bar', 'bar', 'baz', 'baz',
'foo', 'foo', 'qux', 'qux'],
['one', 'two', 'one', 'two',
'one', 'two', 'one', 'two']]))

index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=['A', 'B'])
df2 = df[:4]
df2

A B
first second
bar one -0.827364 -1.346867
two -1.197194 -0.118960
baz one -1.071918 0.825303
two 0.507340 -1.517231

  <script>
    const buttonEl =
      document.querySelector('#df-5a082081-c6ac-484f-9748-3682bcfb55a3 button.colab-df-convert');
    buttonEl.style.display =
      google.colab.kernel.accessAllowed ? 'block' : 'none';

    async function convertToInteractive(key) {
      const element = document.querySelector('#df-5a082081-c6ac-484f-9748-3682bcfb55a3');
      const dataTable =
        await google.colab.kernel.invokeFunction('convertToInteractive',
                                                 [key], {});
      if (!dataTable) return;

      const docLinkHtml = 'Like what you see? Visit the ' +
        '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
        + ' to learn more about interactive tables.';
      element.innerHTML = '';
      dataTable['output_type'] = 'display_data';
      await google.colab.output.renderOutput(dataTable, element);
      const docLink = document.createElement('div');
      docLink.innerHTML = docLinkHtml;
      element.appendChild(docLink);
    }
  </script>
</div>
  • stack() 메소드는 데이터프레임 열들의 계층을 “압축” 한다.
1
2
stacked = df2.stack()
stacked
first  second   
bar    one     A   -0.827364
               B   -1.346867
       two     A   -1.197194
               B   -0.118960
baz    one     A   -1.071918
               B    0.825303
       two     A    0.507340
               B   -1.517231
dtype: float64
  • “Stack된” 데이터프레임 또는 (MultiIndex를 인덱스로 사용하는) Series인 경우, stack()의 역 연산은 unstack()이며, 기본적으로 마지막 계층을 unstack 한다.
1
stacked.unstack()

A B
first second
bar one -0.827364 -1.346867
two -1.197194 -0.118960
baz one -1.071918 0.825303
two 0.507340 -1.517231

  <script>
    const buttonEl =
      document.querySelector('#df-55a6a6ce-eb1f-401f-8bcb-8246e05bc0dd button.colab-df-convert');
    buttonEl.style.display =
      google.colab.kernel.accessAllowed ? 'block' : 'none';

    async function convertToInteractive(key) {
      const element = document.querySelector('#df-55a6a6ce-eb1f-401f-8bcb-8246e05bc0dd');
      const dataTable =
        await google.colab.kernel.invokeFunction('convertToInteractive',
                                                 [key], {});
      if (!dataTable) return;

      const docLinkHtml = 'Like what you see? Visit the ' +
        '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
        + ' to learn more about interactive tables.';
      element.innerHTML = '';
      dataTable['output_type'] = 'display_data';
      await google.colab.output.renderOutput(dataTable, element);
      const docLink = document.createElement('div');
      docLink.innerHTML = docLinkHtml;
      element.appendChild(docLink);
    }
  </script>
</div>
1
stacked.unstack(1)

second one two
first
bar A -0.827364 -1.197194
B -1.346867 -0.118960
baz A -1.071918 0.507340
B 0.825303 -1.517231

  <script>
    const buttonEl =
      document.querySelector('#df-d48353b8-1fdf-416f-b815-fc90f9b22135 button.colab-df-convert');
    buttonEl.style.display =
      google.colab.kernel.accessAllowed ? 'block' : 'none';

    async function convertToInteractive(key) {
      const element = document.querySelector('#df-d48353b8-1fdf-416f-b815-fc90f9b22135');
      const dataTable =
        await google.colab.kernel.invokeFunction('convertToInteractive',
                                                 [key], {});
      if (!dataTable) return;

      const docLinkHtml = 'Like what you see? Visit the ' +
        '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
        + ' to learn more about interactive tables.';
      element.innerHTML = '';
      dataTable['output_type'] = 'display_data';
      await google.colab.output.renderOutput(dataTable, element);
      const docLink = document.createElement('div');
      docLink.innerHTML = docLinkHtml;
      element.appendChild(docLink);
    }
  </script>
</div>
1
stacked.unstack(0)

first bar baz
second
one A -0.827364 -1.071918
B -1.346867 0.825303
two A -1.197194 0.507340
B -0.118960 -1.517231

  <script>
    const buttonEl =
      document.querySelector('#df-2305db67-99b3-4d0f-94cd-f423b56a95f6 button.colab-df-convert');
    buttonEl.style.display =
      google.colab.kernel.accessAllowed ? 'block' : 'none';

    async function convertToInteractive(key) {
      const element = document.querySelector('#df-2305db67-99b3-4d0f-94cd-f423b56a95f6');
      const dataTable =
        await google.colab.kernel.invokeFunction('convertToInteractive',
                                                 [key], {});
      if (!dataTable) return;

      const docLinkHtml = 'Like what you see? Visit the ' +
        '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
        + ' to learn more about interactive tables.';
      element.innerHTML = '';
      dataTable['output_type'] = 'display_data';
      await google.colab.output.renderOutput(dataTable, element);
      const docLink = document.createElement('div');
      docLink.innerHTML = docLinkHtml;
      element.appendChild(docLink);
    }
  </script>
</div>

Pivot Tables (피봇 테이블)

1
2
3
4
5
6
7
df = pd.DataFrame({'A' : ['one', 'one', 'two', 'three'] * 3,
'B' : ['A', 'B', 'C'] * 4,
'C' : ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
'D' : np.random.randn(12),
'E' : np.random.randn(12)})

df

A B C D E
0 one A foo -0.548983 0.943447
1 one B foo -0.400173 1.836891
2 two C foo 0.995067 0.029331
3 three A bar -0.693458 0.457755
4 one B bar 0.786452 -0.665671
5 one C bar -0.686570 -1.718177
6 two A foo 0.338070 0.163933
7 three B foo 1.793455 -0.410172
8 one C foo -0.271664 -0.857467
9 one A bar 0.979950 -1.324755
10 two B bar -0.689860 0.907164
11 three C bar -1.210862 -0.276602

  <script>
    const buttonEl =
      document.querySelector('#df-b66bc639-fd5d-4d5c-8180-1ed50d78e959 button.colab-df-convert');
    buttonEl.style.display =
      google.colab.kernel.accessAllowed ? 'block' : 'none';

    async function convertToInteractive(key) {
      const element = document.querySelector('#df-b66bc639-fd5d-4d5c-8180-1ed50d78e959');
      const dataTable =
        await google.colab.kernel.invokeFunction('convertToInteractive',
                                                 [key], {});
      if (!dataTable) return;

      const docLinkHtml = 'Like what you see? Visit the ' +
        '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
        + ' to learn more about interactive tables.';
      element.innerHTML = '';
      dataTable['output_type'] = 'display_data';
      await google.colab.output.renderOutput(dataTable, element);
      const docLink = document.createElement('div');
      docLink.innerHTML = docLinkHtml;
      element.appendChild(docLink);
    }
  </script>
</div>
  • 이 데이터로부터 피봇 테이블을 매우 쉽게 생성 가능하다.
1
pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C'])

C bar foo
A B
one A 0.979950 -0.548983
B 0.786452 -0.400173
C -0.686570 -0.271664
three A -0.693458 NaN
B NaN 1.793455
C -1.210862 NaN
two A NaN 0.338070
B -0.689860 NaN
C NaN 0.995067

  <script>
    const buttonEl =
      document.querySelector('#df-3e058eed-2670-412b-9bdb-87227d4add5d button.colab-df-convert');
    buttonEl.style.display =
      google.colab.kernel.accessAllowed ? 'block' : 'none';

    async function convertToInteractive(key) {
      const element = document.querySelector('#df-3e058eed-2670-412b-9bdb-87227d4add5d');
      const dataTable =
        await google.colab.kernel.invokeFunction('convertToInteractive',
                                                 [key], {});
      if (!dataTable) return;

      const docLinkHtml = 'Like what you see? Visit the ' +
        '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
        + ' to learn more about interactive tables.';
      element.innerHTML = '';
      dataTable['output_type'] = 'display_data';
      await google.colab.output.renderOutput(dataTable, element);
      const docLink = document.createElement('div');
      docLink.innerHTML = docLinkHtml;
      element.appendChild(docLink);
    }
  </script>
</div>

9.Time Series (시계열)

  • Pandas는 자주 일어나는 변환 (예시 : 5분마다 일어나는 데이터에 대한 2차 데이터 변환) 사이에 수행하는 리샘플링 연산을 위한 간단하고, 강력하며, 효율적인 함수를 제공.
  • 이는 재무 (금융) 응용에서 매우 일반적이지만 이에 국한되지는 않는다.
1
2
3
rng = pd.date_range('1/1/2012', periods=100, freq='S')
ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)
ts.resample('5Min').sum()
2012-01-01    23654
Freq: 5T, dtype: int64
  • 시간대를 표현
1
2
3
rng = pd.date_range('3/6/2012 00:00', periods=5, freq='D')
ts = pd.Series(np.random.randn(len(rng)), rng)
ts
2012-03-06   -0.480140
2012-03-07   -0.904772
2012-03-08    0.386809
2012-03-09    0.873791
2012-03-10    0.478778
Freq: D, dtype: float64
1
2
ts_utc = ts.tz_localize('UTC')
ts_utc
2012-03-06 00:00:00+00:00   -0.480140
2012-03-07 00:00:00+00:00   -0.904772
2012-03-08 00:00:00+00:00    0.386809
2012-03-09 00:00:00+00:00    0.873791
2012-03-10 00:00:00+00:00    0.478778
Freq: D, dtype: float64
  • 다른 시간대로 변환한다.
1
ts_utc.tz_convert('US/Eastern')
2012-03-05 19:00:00-05:00   -0.480140
2012-03-06 19:00:00-05:00   -0.904772
2012-03-07 19:00:00-05:00    0.386809
2012-03-08 19:00:00-05:00    0.873791
2012-03-09 19:00:00-05:00    0.478778
Freq: D, dtype: float64
  • 시간 표현 <–> 기간 표현으로 변환한다.
1
2
3
rng = pd.date_range('1/1/2012', periods=5, freq='M')
ts = pd.Series(np.random.randn(len(rng)), index=rng)
ts
2012-01-31   -0.914418
2012-02-29   -0.077113
2012-03-31   -0.006825
2012-04-30    0.007167
2012-05-31   -0.733946
Freq: M, dtype: float64
1
2
ps = ts.to_period()
ps
2012-01   -0.914418
2012-02   -0.077113
2012-03   -0.006825
2012-04    0.007167
2012-05   -0.733946
Freq: M, dtype: float64
1
ps.to_timestamp()
2012-01-01   -0.914418
2012-02-01   -0.077113
2012-03-01   -0.006825
2012-04-01    0.007167
2012-05-01   -0.733946
Freq: MS, dtype: float64
  • 기간 <–> 시간 변환은 편리한 산술 기능들을 사용할 수 있도록 만들어준다.
  • 다음 예제에서, 11월에 끝나는 연말 결산의 분기별 빈도를 분기말 익월의 월말일 오전 9시로 변환한다.
1
2
3
4
prng = pd.period_range('1990Q1', '2000Q4', freq='Q-NOV')
ts = pd.Series(np.random.randn(len(prng)), prng)
ts.index = (prng.asfreq('M', 'e') + 1).asfreq('H', 's') + 9
ts.head()
1990-03-01 09:00   -0.685539
1990-06-01 09:00   -1.076153
1990-09-01 09:00    0.737103
1990-12-01 09:00   -1.115201
1991-03-01 09:00    0.680304
Freq: H, dtype: float64

10.Categoricals (범주화)

  • Pandas는 데이터프레임 내에 범주형 데이터를 포함할 수 있다.
1
df = pd.DataFrame({"id":[1,2,3,4,5,6], "raw_grade":['a', 'b', 'b', 'a', 'a', 'e']})
  • 가공하지 않은 성적을 범주형 데이터로 변환
1
2
df["grade"] = df["raw_grade"].astype("category")
df["grade"]
0    a
1    b
2    b
3    a
4    a
5    e
Name: grade, dtype: category
Categories (3, object): ['a', 'b', 'e']

범주에 더 의미 있는 이름을 붙여야 한다. (Series.cat.categories로 할당하는 것이 적합)

1
2
df["grade"].cat.set_categories(["very bad", "bad", "medium", "good", "very good"])
df["grade"]
0    a
1    b
2    b
3    a
4    a
5    e
Name: grade, dtype: category
Categories (3, object): ['a', 'b', 'e']
  • 정렬은 사전 순서가 아닌, 해당 범주에서 지정된 순서대로 배열된다.
  • 131번에서 very bad, bad, medium, good, very good 의 순서로 기재되어 있기 때문에 정렬 결과도 해당 순서대로 배열.
1
df.sort_values(by="grade")

id raw_grade grade
0 1 a a
3 4 a a
4 5 a a
1 2 b b
2 3 b b
5 6 e e

  <script>
    const buttonEl =
      document.querySelector('#df-b7146b32-62f2-46d3-9fd0-73a739ee4d33 button.colab-df-convert');
    buttonEl.style.display =
      google.colab.kernel.accessAllowed ? 'block' : 'none';

    async function convertToInteractive(key) {
      const element = document.querySelector('#df-b7146b32-62f2-46d3-9fd0-73a739ee4d33');
      const dataTable =
        await google.colab.kernel.invokeFunction('convertToInteractive',
                                                 [key], {});
      if (!dataTable) return;

      const docLinkHtml = 'Like what you see? Visit the ' +
        '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
        + ' to learn more about interactive tables.';
      element.innerHTML = '';
      dataTable['output_type'] = 'display_data';
      await google.colab.output.renderOutput(dataTable, element);
      const docLink = document.createElement('div');
      docLink.innerHTML = docLinkHtml;
      element.appendChild(docLink);
    }
  </script>
</div>
  • 범주의 열을 기준으로 그룹화하면 빈 범주도 표시된다.
1
df.groupby("grade").size()
grade
a    3
b    2
e    1
dtype: int64

11.Plotting (그래프)

1
2
3
ts = pd.Series(np.random.randn(1000), index=pd.date_range('1/1/2000', periods=1000))
ts = ts.cumsum()
ts.plot()
<matplotlib.axes._subplots.AxesSubplot at 0x7f177f1ef3d0>

png

  • 데이터프레임에서 plot() 메소드는 라벨이 존재하는 모든 열을 그릴 때 편리하다.
1
2
3
4
5
df = pd.DataFrame(np.random.randn(1000, 4), index=ts.index,
columns=['A', 'B', 'C', 'D'])

df = df.cumsum()
plt.figure(); df.plot(); plt.legend(loc='best')
<matplotlib.legend.Legend at 0x7f177ebf3a50>




<Figure size 432x288 with 0 Axes>

png

12.Getting Data In / Out (데이터 입/출력)

CSV

  • csv 파일에 쓴다.
1
df.to_csv('foo.csv')
  • csv 파일을 읽어낸다.
1
pd.read_csv('foo.csv')

Unnamed: 0 A B C D
0 2000-01-01 0.005390 -0.616651 2.261198 -0.868199
1 2000-01-02 -0.084304 -0.247153 0.097660 -0.381440
2 2000-01-03 1.540081 0.806761 0.628394 -0.810376
3 2000-01-04 2.339388 0.573873 2.907442 0.339424
4 2000-01-05 0.938390 2.164131 3.848056 0.158632
... ... ... ... ... ...
995 2002-09-22 -4.732994 38.365117 10.155791 -34.795312
996 2002-09-23 -7.505606 38.661194 10.837375 -35.956062
997 2002-09-24 -4.967844 37.522602 10.977005 -35.639584
998 2002-09-25 -3.707181 35.950703 11.191352 -36.306747
999 2002-09-26 -1.984682 36.604786 10.741370 -35.995049

1000 rows × 5 columns

  <script>
    const buttonEl =
      document.querySelector('#df-f2acd47f-3ad2-400d-b1c5-ff10dfa1d025 button.colab-df-convert');
    buttonEl.style.display =
      google.colab.kernel.accessAllowed ? 'block' : 'none';

    async function convertToInteractive(key) {
      const element = document.querySelector('#df-f2acd47f-3ad2-400d-b1c5-ff10dfa1d025');
      const dataTable =
        await google.colab.kernel.invokeFunction('convertToInteractive',
                                                 [key], {});
      if (!dataTable) return;

      const docLinkHtml = 'Like what you see? Visit the ' +
        '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
        + ' to learn more about interactive tables.';
      element.innerHTML = '';
      dataTable['output_type'] = 'display_data';
      await google.colab.output.renderOutput(dataTable, element);
      const docLink = document.createElement('div');
      docLink.innerHTML = docLinkHtml;
      element.appendChild(docLink);
    }
  </script>
</div>

HDF5

  • HDFStores에 읽고 쓴다.
1
df.to_hdf('foo.h5','df')
  • HDF5 Store에서 읽어온다.
1
pd.read_hdf('foo.h5', 'df')

A B C D
2000-01-01 0.005390 -0.616651 2.261198 -0.868199
2000-01-02 -0.084304 -0.247153 0.097660 -0.381440
2000-01-03 1.540081 0.806761 0.628394 -0.810376
2000-01-04 2.339388 0.573873 2.907442 0.339424
2000-01-05 0.938390 2.164131 3.848056 0.158632
... ... ... ... ...
2002-09-22 -4.732994 38.365117 10.155791 -34.795312
2002-09-23 -7.505606 38.661194 10.837375 -35.956062
2002-09-24 -4.967844 37.522602 10.977005 -35.639584
2002-09-25 -3.707181 35.950703 11.191352 -36.306747
2002-09-26 -1.984682 36.604786 10.741370 -35.995049

1000 rows × 4 columns

  <script>
    const buttonEl =
      document.querySelector('#df-d8254976-8070-4b22-bdc3-11fbbd746968 button.colab-df-convert');
    buttonEl.style.display =
      google.colab.kernel.accessAllowed ? 'block' : 'none';

    async function convertToInteractive(key) {
      const element = document.querySelector('#df-d8254976-8070-4b22-bdc3-11fbbd746968');
      const dataTable =
        await google.colab.kernel.invokeFunction('convertToInteractive',
                                                 [key], {});
      if (!dataTable) return;

      const docLinkHtml = 'Like what you see? Visit the ' +
        '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
        + ' to learn more about interactive tables.';
      element.innerHTML = '';
      dataTable['output_type'] = 'display_data';
      await google.colab.output.renderOutput(dataTable, element);
      const docLink = document.createElement('div');
      docLink.innerHTML = docLinkHtml;
      element.appendChild(docLink);
    }
  </script>
</div>

Excel

  • MS Excel에 읽고 쓴다.

  • 엑셀 파일에 쓴다.

1
df.to_excel('foo.xlsx', sheet_name='Sheet1')
  • 엑셀 파일을 읽어온다.
1
pd.read_excel('foo.xlsx', 'Sheet1', index_col = None, na_values=['NA'])

Unnamed: 0 A B C D
0 2000-01-01 0.005390 -0.616651 2.261198 -0.868199
1 2000-01-02 -0.084304 -0.247153 0.097660 -0.381440
2 2000-01-03 1.540081 0.806761 0.628394 -0.810376
3 2000-01-04 2.339388 0.573873 2.907442 0.339424
4 2000-01-05 0.938390 2.164131 3.848056 0.158632
... ... ... ... ... ...
995 2002-09-22 -4.732994 38.365117 10.155791 -34.795312
996 2002-09-23 -7.505606 38.661194 10.837375 -35.956062
997 2002-09-24 -4.967844 37.522602 10.977005 -35.639584
998 2002-09-25 -3.707181 35.950703 11.191352 -36.306747
999 2002-09-26 -1.984682 36.604786 10.741370 -35.995049

1000 rows × 5 columns

  <script>
    const buttonEl =
      document.querySelector('#df-0adace20-cbb2-4908-846b-7f1dd49ea7cb button.colab-df-convert');
    buttonEl.style.display =
      google.colab.kernel.accessAllowed ? 'block' : 'none';

    async function convertToInteractive(key) {
      const element = document.querySelector('#df-0adace20-cbb2-4908-846b-7f1dd49ea7cb');
      const dataTable =
        await google.colab.kernel.invokeFunction('convertToInteractive',
                                                 [key], {});
      if (!dataTable) return;

      const docLinkHtml = 'Like what you see? Visit the ' +
        '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
        + ' to learn more about interactive tables.';
      element.innerHTML = '';
      dataTable['output_type'] = 'display_data';
      await google.colab.output.renderOutput(dataTable, element);
      const docLink = document.createElement('div');
      docLink.innerHTML = docLinkHtml;
      element.appendChild(docLink);
    }
  </script>
</div>

13.Gotchas (잡았다!)

  • 연산 수행 시 다음과 같은 예외 상황(Error)을 볼 수도 있다.
1
2
if pd.Series([False, True, False]):
print("I was true")
---------------------------------------------------------------------------

ValueError                                Traceback (most recent call last)

<ipython-input-129-5c782b38cd2f> in <module>()
----> 1 if pd.Series([False, True, False]):
      2     print("I was true")


/usr/local/lib/python3.7/dist-packages/pandas/core/generic.py in __nonzero__(self)
   1536     def __nonzero__(self):
   1537         raise ValueError(
-> 1538             f"The truth value of a {type(self).__name__} is ambiguous. "
   1539             "Use a.empty, a.bool(), a.item(), a.any() or a.all()."
   1540         )


ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().
  • 이런 경우에는 any(), all(), empty 등을 사용해서 무엇을 원하는지를 선택 (반영)해주어야 한다.
1
2
if pd.Series([False, True, False])is not None:
print("I was not None")
I was not None

End of document

pandas_tutorial_01

데이터 전처리

  • 데이터 전처리의 기본
    • garbage Data —[ Great Model ]—> Garbage Results

데이터 전처리의 주요 과정

  • 데이터 전처리 수행 프로세스***(중요)***
    • 1.중복값 제거 및 결측치 처리 -> 2.이상치 처리 -> 3.Feature Engineering
    • 주요 목적 : 더 나은 분석 결과 도출 및 모형 성능 개선 실현

pandas 기본자료형

  • 숫자, 문자 모두 들어간다.
  • 중복이 불가능하다.

판다스

라이브러리 불러오기

1
2
import pandas as pd
print(pd.__version__)
1.3.5

테스트

1
2
3
4
5
temp_dic = {"col1": [1,2,3],
"col2": [3, 4, 5]}

df = pd.DataFrame(temp_dic) # DataFrame() 사용
df

col1 col2
0 1 3
1 2 4
2 3 5

  <script>
    const buttonEl =
      document.querySelector('#df-5f12a67d-363f-495e-a8d6-e15402e0c5d6 button.colab-df-convert');
    buttonEl.style.display =
      google.colab.kernel.accessAllowed ? 'block' : 'none';

    async function convertToInteractive(key) {
      const element = document.querySelector('#df-5f12a67d-363f-495e-a8d6-e15402e0c5d6');
      const dataTable =
        await google.colab.kernel.invokeFunction('convertToInteractive',
                                                 [key], {});
      if (!dataTable) return;

      const docLinkHtml = 'Like what you see? Visit the ' +
        '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
        + ' to learn more about interactive tables.';
      element.innerHTML = '';
      dataTable['output_type'] = 'display_data';
      await google.colab.output.renderOutput(dataTable, element);
      const docLink = document.createElement('div');
      docLink.innerHTML = docLinkHtml;
      element.appendChild(docLink);
    }
  </script>
</div>
1
print(type(df))
<class 'pandas.core.frame.DataFrame'>
1
2
3
4
temp_dic = {'a':1, 'b':2, 'c':3}
ser = pd.Series(temp_dic) # Series() 사용
print(ser)
print(type(ser))
a    1
b    2
c    3
dtype: int64
<class 'pandas.core.series.Series'>

구글 드라이브 연동

  • 구글 드라이브 → colab notebook → 새 폴더 생성 : data → 슬랙에서 다운 받은 lemonade.csv 파일을 올린다 -> 다음 코드를 실행
1
2
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive

Mounted at ..drive 가 출력되었으므로 성공

현재 좌측에 폴더 그림 -> drive -> mydrive -> Colab Notebooks -> data -> Lemonade2016.csv를 찾아서 우클릭 -> 경로 복사 -> 다음 코드에 붙여넣어 사용

1
2
3
DATA_PATH = '/content/drive/MyDrive/Colab Notebooks/data/Lemonade2016.csv'
juice = pd.read_csv(DATA_PATH)
juice

Date Location Lemon Orange Temperature Leaflets Price
0 7/1/2016 Park 97 67 70 90.0 0.25
1 7/2/2016 Park 98 67 72 90.0 0.25
2 7/3/2016 Park 110 77 71 104.0 0.25
3 7/4/2016 Beach 134 99 76 98.0 0.25
4 7/5/2016 Beach 159 118 78 135.0 0.25
5 7/6/2016 Beach 103 69 82 90.0 0.25
6 7/6/2016 Beach 103 69 82 90.0 0.25
7 7/7/2016 Beach 143 101 81 135.0 0.25
8 NaN Beach 123 86 82 113.0 0.25
9 7/9/2016 Beach 134 95 80 126.0 0.25
10 7/10/2016 Beach 140 98 82 131.0 0.25
11 7/11/2016 Beach 162 120 83 135.0 0.25
12 7/12/2016 Beach 130 95 84 99.0 0.25
13 7/13/2016 Beach 109 75 77 99.0 0.25
14 7/14/2016 Beach 122 85 78 113.0 0.25
15 7/15/2016 Beach 98 62 75 108.0 0.50
16 7/16/2016 Beach 81 50 74 90.0 0.50
17 7/17/2016 Beach 115 76 77 126.0 0.50
18 7/18/2016 Park 131 92 81 122.0 0.50
19 7/19/2016 Park 122 85 78 113.0 0.50
20 7/20/2016 Park 71 42 70 NaN 0.50
21 7/21/2016 Park 83 50 77 90.0 0.50
22 7/22/2016 Park 112 75 80 108.0 0.50
23 7/23/2016 Park 120 82 81 117.0 0.50
24 7/24/2016 Park 121 82 82 117.0 0.50
25 7/25/2016 Park 156 113 84 135.0 0.50
26 7/26/2016 Park 176 129 83 158.0 0.35
27 7/27/2016 Park 104 68 80 99.0 0.35
28 7/28/2016 Park 96 63 82 90.0 0.35
29 7/29/2016 Park 100 66 81 95.0 0.35
30 7/30/2016 Beach 88 57 82 81.0 0.35
31 7/31/2016 Beach 76 47 82 68.0 0.35

  <script>
    const buttonEl =
      document.querySelector('#df-e4ed5b94-20e7-42ba-9f65-459f54e1728a button.colab-df-convert');
    buttonEl.style.display =
      google.colab.kernel.accessAllowed ? 'block' : 'none';

    async function convertToInteractive(key) {
      const element = document.querySelector('#df-e4ed5b94-20e7-42ba-9f65-459f54e1728a');
      const dataTable =
        await google.colab.kernel.invokeFunction('convertToInteractive',
                                                 [key], {});
      if (!dataTable) return;

      const docLinkHtml = 'Like what you see? Visit the ' +
        '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
        + ' to learn more about interactive tables.';
      element.innerHTML = '';
      dataTable['output_type'] = 'display_data';
      await google.colab.output.renderOutput(dataTable, element);
      const docLink = document.createElement('div');
      docLink.innerHTML = docLinkHtml;
      element.appendChild(docLink);
    }
  </script>
</div>
  • 데이터를 불러왔다.
  • 첫번째 파악해야 하는 것!
    • 데이터 구조를 파악해보자
1
juice.info()  # 데이터 구조 파악
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32 entries, 0 to 31
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Date         31 non-null     object 
 1   Location     32 non-null     object 
 2   Lemon        32 non-null     int64  
 3   Orange       32 non-null     int64  
 4   Temperature  32 non-null     int64  
 5   Leaflets     31 non-null     float64
 6   Price        32 non-null     float64
dtypes: float64(2), int64(3), object(2)
memory usage: 1.9+ KB
1
juice.head()  # 상위의 데이터를 여러개 불러온다. 디폴트 값이 5개.

Date Location Lemon Orange Temperature Leaflets Price
0 7/1/2016 Park 97 67 70 90.0 0.25
1 7/2/2016 Park 98 67 72 90.0 0.25
2 7/3/2016 Park 110 77 71 104.0 0.25
3 7/4/2016 Beach 134 99 76 98.0 0.25
4 7/5/2016 Beach 159 118 78 135.0 0.25

  <script>
    const buttonEl =
      document.querySelector('#df-c3238942-1033-4010-80b8-10e94c66dc23 button.colab-df-convert');
    buttonEl.style.display =
      google.colab.kernel.accessAllowed ? 'block' : 'none';

    async function convertToInteractive(key) {
      const element = document.querySelector('#df-c3238942-1033-4010-80b8-10e94c66dc23');
      const dataTable =
        await google.colab.kernel.invokeFunction('convertToInteractive',
                                                 [key], {});
      if (!dataTable) return;

      const docLinkHtml = 'Like what you see? Visit the ' +
        '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
        + ' to learn more about interactive tables.';
      element.innerHTML = '';
      dataTable['output_type'] = 'display_data';
      await google.colab.output.renderOutput(dataTable, element);
      const docLink = document.createElement('div');
      docLink.innerHTML = docLinkHtml;
      element.appendChild(docLink);
    }
  </script>
</div>
1
juice.tail()  # 하위의 데이터를 여러개 불러온다. 디폴트 값이 5개

Date Location Lemon Orange Temperature Leaflets Price
27 7/27/2016 Park 104 68 80 99.0 0.35
28 7/28/2016 Park 96 63 82 90.0 0.35
29 7/29/2016 Park 100 66 81 95.0 0.35
30 7/30/2016 Beach 88 57 82 81.0 0.35
31 7/31/2016 Beach 76 47 82 68.0 0.35

  <script>
    const buttonEl =
      document.querySelector('#df-cc60af2a-dd96-48c1-9398-546b4a947c77 button.colab-df-convert');
    buttonEl.style.display =
      google.colab.kernel.accessAllowed ? 'block' : 'none';

    async function convertToInteractive(key) {
      const element = document.querySelector('#df-cc60af2a-dd96-48c1-9398-546b4a947c77');
      const dataTable =
        await google.colab.kernel.invokeFunction('convertToInteractive',
                                                 [key], {});
      if (!dataTable) return;

      const docLinkHtml = 'Like what you see? Visit the ' +
        '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
        + ' to learn more about interactive tables.';
      element.innerHTML = '';
      dataTable['output_type'] = 'display_data';
      await google.colab.output.renderOutput(dataTable, element);
      const docLink = document.createElement('div');
      docLink.innerHTML = docLinkHtml;
      element.appendChild(docLink);
    }
  </script>
</div>
  • Describe() 함수
    • 기술통계량 확인해주는 함수
1
juice.describe()

Lemon Orange Temperature Leaflets Price
count 32.000000 32.000000 32.000000 31.000000 32.000000
mean 116.156250 80.000000 78.968750 108.548387 0.354687
std 25.823357 21.863211 4.067847 20.117718 0.113137
min 71.000000 42.000000 70.000000 68.000000 0.250000
25% 98.000000 66.750000 77.000000 90.000000 0.250000
50% 113.500000 76.500000 80.500000 108.000000 0.350000
75% 131.750000 95.000000 82.000000 124.000000 0.500000
max 176.000000 129.000000 84.000000 158.000000 0.500000

  <script>
    const buttonEl =
      document.querySelector('#df-bfd69db7-f9d2-49ea-84ed-2989ca9e02a8 button.colab-df-convert');
    buttonEl.style.display =
      google.colab.kernel.accessAllowed ? 'block' : 'none';

    async function convertToInteractive(key) {
      const element = document.querySelector('#df-bfd69db7-f9d2-49ea-84ed-2989ca9e02a8');
      const dataTable =
        await google.colab.kernel.invokeFunction('convertToInteractive',
                                                 [key], {});
      if (!dataTable) return;

      const docLinkHtml = 'Like what you see? Visit the ' +
        '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
        + ' to learn more about interactive tables.';
      element.innerHTML = '';
      dataTable['output_type'] = 'display_data';
      await google.colab.output.renderOutput(dataTable, element);
      const docLink = document.createElement('div');
      docLink.innerHTML = docLinkHtml;
      element.appendChild(docLink);
    }
  </script>
</div>
1
type(juice.describe())  # Describe함수 결과물의 타입은 DataFrame 이다.
pandas.core.frame.DataFrame
  • value_counts()
1
2
print(juice['Location'].value_counts())
print(type(juice['Location'].value_counts()))
Beach    17
Park     15
Name: Location, dtype: int64
<class 'pandas.core.series.Series'>

데이터 다뤄보기

  • 행과 열을 핸들링 해보자.
1
2
juice['Sold'] = 0   # sold 열 추가.
print(juice.head(3))
       Date Location  Lemon  Orange  Temperature  Leaflets  Price  Sold
0  7/1/2016     Park     97      67           70      90.0   0.25     0
1  7/2/2016     Park     98      67           72      90.0   0.25     0
2  7/3/2016     Park    110      77           71     104.0   0.25     0
1
2
juice['Sold'] = juice['Lemon'] + juice['Orange']  # Sold에 값 설정
print(juice.head(3))
       Date Location  Lemon  Orange  Temperature  Leaflets  Price  Sold
0  7/1/2016     Park     97      67           70      90.0   0.25   164
1  7/2/2016     Park     98      67           72      90.0   0.25   165
2  7/3/2016     Park    110      77           71     104.0   0.25   187
  • 매출액 = 가격 x 판매량
  • Revenue 로 만들어보자
1
2
juice['Revenue'] = juice['Sold'] * juice['Price']
print(juice.head(3))
       Date Location  Lemon  Orange  Temperature  Leaflets  Price  Sold  \
0  7/1/2016     Park     97      67           70      90.0   0.25   164   
1  7/2/2016     Park     98      67           72      90.0   0.25   165   
2  7/3/2016     Park    110      77           71     104.0   0.25   187   

   Revenue  
0    41.00  
1    41.25  
2    46.75  
  • drop(axis = 0|1)
    • axis는 ‘축’을 의미한다. 한 축을 따라서 연산이 된다.
    • axis를 0으로 설정 시, 행(=index)방향으로 drop() 실행
    • axis를 1로 설정 시, 열방향으로 drop 수행함.
1
2
juice_column_drop = juice.drop('Sold', axis = 1)
print(juice_column_drop.head(3))
       Date Location  Lemon  Orange  Temperature  Leaflets  Price  Revenue
0  7/1/2016     Park     97      67           70      90.0   0.25    41.00
1  7/2/2016     Park     98      67           72      90.0   0.25    41.25
2  7/3/2016     Park    110      77           71     104.0   0.25    46.75
1
2
juice_row_drop = juice.drop(0, axis = 0)
print(juice_row_drop.head(3))
       Date Location  Lemon  Orange  Temperature  Leaflets  Price  Sold  \
1  7/2/2016     Park     98      67           72      90.0   0.25   165   
2  7/3/2016     Park    110      77           71     104.0   0.25   187   
3  7/4/2016    Beach    134      99           76      98.0   0.25   233   

   Revenue  
1    41.25  
2    46.75  
3    58.25  

데이터 인덱싱

1
juice[0:5] 

Date Location Lemon Orange Temperature Leaflets Price Sold Revenue
0 7/1/2016 Park 97 67 70 90.0 0.25 164 41.00
1 7/2/2016 Park 98 67 72 90.0 0.25 165 41.25
2 7/3/2016 Park 110 77 71 104.0 0.25 187 46.75
3 7/4/2016 Beach 134 99 76 98.0 0.25 233 58.25
4 7/5/2016 Beach 159 118 78 135.0 0.25 277 69.25

  <script>
    const buttonEl =
      document.querySelector('#df-bfa3fabe-e933-4527-879f-12c188c0b8bd button.colab-df-convert');
    buttonEl.style.display =
      google.colab.kernel.accessAllowed ? 'block' : 'none';

    async function convertToInteractive(key) {
      const element = document.querySelector('#df-bfa3fabe-e933-4527-879f-12c188c0b8bd');
      const dataTable =
        await google.colab.kernel.invokeFunction('convertToInteractive',
                                                 [key], {});
      if (!dataTable) return;

      const docLinkHtml = 'Like what you see? Visit the ' +
        '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
        + ' to learn more about interactive tables.';
      element.innerHTML = '';
      dataTable['output_type'] = 'display_data';
      await google.colab.output.renderOutput(dataTable, element);
      const docLink = document.createElement('div');
      docLink.innerHTML = docLinkHtml;
      element.appendChild(docLink);
    }
  </script>
</div>

boolean 값을 활용한 데이터 추출

1
2
3
# location이 Beach인 경우
# juice['Location'].value_counts()
juice[juice['Location'] == "Beach"]

Date Location Lemon Orange Temperature Leaflets Price Sold Revenue
3 7/4/2016 Beach 134 99 76 98.0 0.25 233 58.25
4 7/5/2016 Beach 159 118 78 135.0 0.25 277 69.25
5 7/6/2016 Beach 103 69 82 90.0 0.25 172 43.00
6 7/6/2016 Beach 103 69 82 90.0 0.25 172 43.00
7 7/7/2016 Beach 143 101 81 135.0 0.25 244 61.00
8 NaN Beach 123 86 82 113.0 0.25 209 52.25
9 7/9/2016 Beach 134 95 80 126.0 0.25 229 57.25
10 7/10/2016 Beach 140 98 82 131.0 0.25 238 59.50
11 7/11/2016 Beach 162 120 83 135.0 0.25 282 70.50
12 7/12/2016 Beach 130 95 84 99.0 0.25 225 56.25
13 7/13/2016 Beach 109 75 77 99.0 0.25 184 46.00
14 7/14/2016 Beach 122 85 78 113.0 0.25 207 51.75
15 7/15/2016 Beach 98 62 75 108.0 0.50 160 80.00
16 7/16/2016 Beach 81 50 74 90.0 0.50 131 65.50
17 7/17/2016 Beach 115 76 77 126.0 0.50 191 95.50
30 7/30/2016 Beach 88 57 82 81.0 0.35 145 50.75
31 7/31/2016 Beach 76 47 82 68.0 0.35 123 43.05

  <script>
    const buttonEl =
      document.querySelector('#df-f23f5092-ba57-4126-bdd5-ecc3581c90cd button.colab-df-convert');
    buttonEl.style.display =
      google.colab.kernel.accessAllowed ? 'block' : 'none';

    async function convertToInteractive(key) {
      const element = document.querySelector('#df-f23f5092-ba57-4126-bdd5-ecc3581c90cd');
      const dataTable =
        await google.colab.kernel.invokeFunction('convertToInteractive',
                                                 [key], {});
      if (!dataTable) return;

      const docLinkHtml = 'Like what you see? Visit the ' +
        '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
        + ' to learn more about interactive tables.';
      element.innerHTML = '';
      dataTable['output_type'] = 'display_data';
      await google.colab.output.renderOutput(dataTable, element);
      const docLink = document.createElement('div');
      docLink.innerHTML = docLinkHtml;
      element.appendChild(docLink);
    }
  </script>
</div>
1
2
3
# location이 Beach인 경우
# juice['Location'].value_counts()
juice[juice['Leaflets'] >= 100]

Date Location Lemon Orange Temperature Leaflets Price Sold Revenue
2 7/3/2016 Park 110 77 71 104.0 0.25 187 46.75
4 7/5/2016 Beach 159 118 78 135.0 0.25 277 69.25
7 7/7/2016 Beach 143 101 81 135.0 0.25 244 61.00
8 NaN Beach 123 86 82 113.0 0.25 209 52.25
9 7/9/2016 Beach 134 95 80 126.0 0.25 229 57.25
10 7/10/2016 Beach 140 98 82 131.0 0.25 238 59.50
11 7/11/2016 Beach 162 120 83 135.0 0.25 282 70.50
14 7/14/2016 Beach 122 85 78 113.0 0.25 207 51.75
15 7/15/2016 Beach 98 62 75 108.0 0.50 160 80.00
17 7/17/2016 Beach 115 76 77 126.0 0.50 191 95.50
18 7/18/2016 Park 131 92 81 122.0 0.50 223 111.50
19 7/19/2016 Park 122 85 78 113.0 0.50 207 103.50
22 7/22/2016 Park 112 75 80 108.0 0.50 187 93.50
23 7/23/2016 Park 120 82 81 117.0 0.50 202 101.00
24 7/24/2016 Park 121 82 82 117.0 0.50 203 101.50
25 7/25/2016 Park 156 113 84 135.0 0.50 269 134.50
26 7/26/2016 Park 176 129 83 158.0 0.35 305 106.75

  <script>
    const buttonEl =
      document.querySelector('#df-080b31c4-9d87-4d46-a98d-5d6eec44b68f button.colab-df-convert');
    buttonEl.style.display =
      google.colab.kernel.accessAllowed ? 'block' : 'none';

    async function convertToInteractive(key) {
      const element = document.querySelector('#df-080b31c4-9d87-4d46-a98d-5d6eec44b68f');
      const dataTable =
        await google.colab.kernel.invokeFunction('convertToInteractive',
                                                 [key], {});
      if (!dataTable) return;

      const docLinkHtml = 'Like what you see? Visit the ' +
        '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
        + ' to learn more about interactive tables.';
      element.innerHTML = '';
      dataTable['output_type'] = 'display_data';
      await google.colab.output.renderOutput(dataTable, element);
      const docLink = document.createElement('div');
      docLink.innerHTML = docLinkHtml;
      element.appendChild(docLink);
    }
  </script>
</div>

iloc vs loc

  • 차이를 확인한다!
1
2
3
juice.head(3)
# index 번호는 다음 실행 결과에서
# 0 1 2 3 4 5 6 7 8

Date Location Lemon Orange Temperature Leaflets Price Sold Revenue
0 7/1/2016 Park 97 67 70 90.0 0.25 164 41.00
1 7/2/2016 Park 98 67 72 90.0 0.25 165 41.25
2 7/3/2016 Park 110 77 71 104.0 0.25 187 46.75

  <script>
    const buttonEl =
      document.querySelector('#df-dadb1a11-c681-42a1-9b8b-85510d760ea0 button.colab-df-convert');
    buttonEl.style.display =
      google.colab.kernel.accessAllowed ? 'block' : 'none';

    async function convertToInteractive(key) {
      const element = document.querySelector('#df-dadb1a11-c681-42a1-9b8b-85510d760ea0');
      const dataTable =
        await google.colab.kernel.invokeFunction('convertToInteractive',
                                                 [key], {});
      if (!dataTable) return;

      const docLinkHtml = 'Like what you see? Visit the ' +
        '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
        + ' to learn more about interactive tables.';
      element.innerHTML = '';
      dataTable['output_type'] = 'display_data';
      await google.colab.output.renderOutput(dataTable, element);
      const docLink = document.createElement('div');
      docLink.innerHTML = docLinkHtml;
      element.appendChild(docLink);
    }
  </script>
</div>

%%time

  • 실행 시간 측정
  • 코드의 효율을 살펴보자
1
2
3
%%time

juice.iloc[0:3, 0:2] # 인덱스 기반
CPU times: user 2.14 ms, sys: 0 ns, total: 2.14 ms
Wall time: 3.19 ms

Date Location
0 7/1/2016 Park
1 7/2/2016 Park
2 7/3/2016 Park

  <script>
    const buttonEl =
      document.querySelector('#df-dfd7fd78-f2b8-491f-a422-bd0e37bc0297 button.colab-df-convert');
    buttonEl.style.display =
      google.colab.kernel.accessAllowed ? 'block' : 'none';

    async function convertToInteractive(key) {
      const element = document.querySelector('#df-dfd7fd78-f2b8-491f-a422-bd0e37bc0297');
      const dataTable =
        await google.colab.kernel.invokeFunction('convertToInteractive',
                                                 [key], {});
      if (!dataTable) return;

      const docLinkHtml = 'Like what you see? Visit the ' +
        '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
        + ' to learn more about interactive tables.';
      element.innerHTML = '';
      dataTable['output_type'] = 'display_data';
      await google.colab.output.renderOutput(dataTable, element);
      const docLink = document.createElement('div');
      docLink.innerHTML = docLinkHtml;
      element.appendChild(docLink);
    }
  </script>
</div>
  • loc
  • -> 라벨 기반
1
2
3
%%time

juice.loc[0:2, ['Date', 'Location']] # 라벨 기반이라 0:2로 실행 시, 3개 생성된다
CPU times: user 1.64 ms, sys: 0 ns, total: 1.64 ms
Wall time: 1.62 ms

Date Location
0 7/1/2016 Park
1 7/2/2016 Park
2 7/3/2016 Park

  <script>
    const buttonEl =
      document.querySelector('#df-f63e27df-425e-4e52-acd7-6c213c3c886a button.colab-df-convert');
    buttonEl.style.display =
      google.colab.kernel.accessAllowed ? 'block' : 'none';

    async function convertToInteractive(key) {
      const element = document.querySelector('#df-f63e27df-425e-4e52-acd7-6c213c3c886a');
      const dataTable =
        await google.colab.kernel.invokeFunction('convertToInteractive',
                                                 [key], {});
      if (!dataTable) return;

      const docLinkHtml = 'Like what you see? Visit the ' +
        '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
        + ' to learn more about interactive tables.';
      element.innerHTML = '';
      dataTable['output_type'] = 'display_data';
      await google.colab.output.renderOutput(dataTable, element);
      const docLink = document.createElement('div');
      docLink.innerHTML = docLinkHtml;
      element.appendChild(docLink);
    }
  </script>
</div>
1
2
3
# juice[juice['Leaflets'] >= 100, 컬럼명 별도 추출]

juice.loc[juice['Leaflets'] >= 100, ['Date', 'Location']] # 컬럼은 컬럼별로 추출

Date Location
2 7/3/2016 Park
4 7/5/2016 Beach
7 7/7/2016 Beach
8 NaN Beach
9 7/9/2016 Beach
10 7/10/2016 Beach
11 7/11/2016 Beach
14 7/14/2016 Beach
15 7/15/2016 Beach
17 7/17/2016 Beach
18 7/18/2016 Park
19 7/19/2016 Park
22 7/22/2016 Park
23 7/23/2016 Park
24 7/24/2016 Park
25 7/25/2016 Park
26 7/26/2016 Park

  <script>
    const buttonEl =
      document.querySelector('#df-80c7ea4c-f3b6-416c-90a6-935ca4d10c87 button.colab-df-convert');
    buttonEl.style.display =
      google.colab.kernel.accessAllowed ? 'block' : 'none';

    async function convertToInteractive(key) {
      const element = document.querySelector('#df-80c7ea4c-f3b6-416c-90a6-935ca4d10c87');
      const dataTable =
        await google.colab.kernel.invokeFunction('convertToInteractive',
                                                 [key], {});
      if (!dataTable) return;

      const docLinkHtml = 'Like what you see? Visit the ' +
        '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
        + ' to learn more about interactive tables.';
      element.innerHTML = '';
      dataTable['output_type'] = 'display_data';
      await google.colab.output.renderOutput(dataTable, element);
      const docLink = document.createElement('div');
      docLink.innerHTML = docLinkHtml;
      element.appendChild(docLink);
    }
  </script>
</div>
1
juice.iloc[juice['Leaflets'] >= 100, 0:2]

정렬

  • sort_values()
1
2
# 매출액 순서로 정렬
juice.sort_values(by=['Revenue'], ascending=False).head(3)

Date Location Lemon Orange Temperature Leaflets Price Sold Revenue
25 7/25/2016 Park 156 113 84 135.0 0.50 269 134.50
18 7/18/2016 Park 131 92 81 122.0 0.50 223 111.50
26 7/26/2016 Park 176 129 83 158.0 0.35 305 106.75

  <script>
    const buttonEl =
      document.querySelector('#df-d4ef77c6-8bab-4eae-9f75-40bfaf70f3b7 button.colab-df-convert');
    buttonEl.style.display =
      google.colab.kernel.accessAllowed ? 'block' : 'none';

    async function convertToInteractive(key) {
      const element = document.querySelector('#df-d4ef77c6-8bab-4eae-9f75-40bfaf70f3b7');
      const dataTable =
        await google.colab.kernel.invokeFunction('convertToInteractive',
                                                 [key], {});
      if (!dataTable) return;

      const docLinkHtml = 'Like what you see? Visit the ' +
        '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
        + ' to learn more about interactive tables.';
      element.innerHTML = '';
      dataTable['output_type'] = 'display_data';
      await google.colab.output.renderOutput(dataTable, element);
      const docLink = document.createElement('div');
      docLink.innerHTML = docLinkHtml;
      element.appendChild(docLink);
    }
  </script>
</div>
1
2
3
4
5
6
# 2개 이상 기준으로 할 경우, 그룹화하여 정렬됨
juice.sort_values(by=['Price', 'Temperature'], ascending=False)

#이것도 가능
#juice.sort_values(by=['Price', 'Temperature'], ascending=[False, True]) -> Price는 내림차순, Temparature는 오름차순
#juice.sort_values(by=['Price', 'Temperature'], ascending=[False, True]).rest_index(drop=True) -> index 번호도 재정렬

Date Location Lemon Orange Temperature Leaflets Price Sold Revenue
25 7/25/2016 Park 156 113 84 135.0 0.50 269 134.50
24 7/24/2016 Park 121 82 82 117.0 0.50 203 101.50
18 7/18/2016 Park 131 92 81 122.0 0.50 223 111.50
23 7/23/2016 Park 120 82 81 117.0 0.50 202 101.00
22 7/22/2016 Park 112 75 80 108.0 0.50 187 93.50
19 7/19/2016 Park 122 85 78 113.0 0.50 207 103.50
17 7/17/2016 Beach 115 76 77 126.0 0.50 191 95.50
21 7/21/2016 Park 83 50 77 90.0 0.50 133 66.50
15 7/15/2016 Beach 98 62 75 108.0 0.50 160 80.00
16 7/16/2016 Beach 81 50 74 90.0 0.50 131 65.50
20 7/20/2016 Park 71 42 70 NaN 0.50 113 56.50
26 7/26/2016 Park 176 129 83 158.0 0.35 305 106.75
28 7/28/2016 Park 96 63 82 90.0 0.35 159 55.65
30 7/30/2016 Beach 88 57 82 81.0 0.35 145 50.75
31 7/31/2016 Beach 76 47 82 68.0 0.35 123 43.05
29 7/29/2016 Park 100 66 81 95.0 0.35 166 58.10
27 7/27/2016 Park 104 68 80 99.0 0.35 172 60.20
12 7/12/2016 Beach 130 95 84 99.0 0.25 225 56.25
11 7/11/2016 Beach 162 120 83 135.0 0.25 282 70.50
5 7/6/2016 Beach 103 69 82 90.0 0.25 172 43.00
6 7/6/2016 Beach 103 69 82 90.0 0.25 172 43.00
8 NaN Beach 123 86 82 113.0 0.25 209 52.25
10 7/10/2016 Beach 140 98 82 131.0 0.25 238 59.50
7 7/7/2016 Beach 143 101 81 135.0 0.25 244 61.00
9 7/9/2016 Beach 134 95 80 126.0 0.25 229 57.25
4 7/5/2016 Beach 159 118 78 135.0 0.25 277 69.25
14 7/14/2016 Beach 122 85 78 113.0 0.25 207 51.75
13 7/13/2016 Beach 109 75 77 99.0 0.25 184 46.00
3 7/4/2016 Beach 134 99 76 98.0 0.25 233 58.25
1 7/2/2016 Park 98 67 72 90.0 0.25 165 41.25
2 7/3/2016 Park 110 77 71 104.0 0.25 187 46.75
0 7/1/2016 Park 97 67 70 90.0 0.25 164 41.00

  <script>
    const buttonEl =
      document.querySelector('#df-14051fd5-627b-4ebe-ab05-3415f55cc7f3 button.colab-df-convert');
    buttonEl.style.display =
      google.colab.kernel.accessAllowed ? 'block' : 'none';

    async function convertToInteractive(key) {
      const element = document.querySelector('#df-14051fd5-627b-4ebe-ab05-3415f55cc7f3');
      const dataTable =
        await google.colab.kernel.invokeFunction('convertToInteractive',
                                                 [key], {});
      if (!dataTable) return;

      const docLinkHtml = 'Like what you see? Visit the ' +
        '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
        + ' to learn more about interactive tables.';
      element.innerHTML = '';
      dataTable['output_type'] = 'display_data';
      await google.colab.output.renderOutput(dataTable, element);
      const docLink = document.createElement('div');
      docLink.innerHTML = docLinkHtml;
      element.appendChild(docLink);
    }
  </script>
</div>

Group by

  • R dplyr groupby() %>% summarize()
  • -> 데이터 요약 -> 엑셀로 피벗 테이블
1
2
# Location 항목을 카운트
juice.groupby(by = 'Location').count()

Date Lemon Orange Temperature Leaflets Price Sold Revenue
Location
Beach 16 17 17 17 17 17 17 17
Park 15 15 15 15 14 15 15 15

  <script>
    const buttonEl =
      document.querySelector('#df-26afeca1-6bb7-494f-ba2d-92aab015b058 button.colab-df-convert');
    buttonEl.style.display =
      google.colab.kernel.accessAllowed ? 'block' : 'none';

    async function convertToInteractive(key) {
      const element = document.querySelector('#df-26afeca1-6bb7-494f-ba2d-92aab015b058');
      const dataTable =
        await google.colab.kernel.invokeFunction('convertToInteractive',
                                                 [key], {});
      if (!dataTable) return;

      const docLinkHtml = 'Like what you see? Visit the ' +
        '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
        + ' to learn more about interactive tables.';
      element.innerHTML = '';
      dataTable['output_type'] = 'display_data';
      await google.colab.output.renderOutput(dataTable, element);
      const docLink = document.createElement('div');
      docLink.innerHTML = docLinkHtml;
      element.appendChild(docLink);
    }
  </script>
</div>
1
2
3
4
# 집계 함수
import numpy as np

juice.groupby(['Location'])['Revenue'].agg([max, min, sum, np.mean])

max min sum mean
Location
Beach 95.5 43.0 1002.8 58.988235
Park 134.5 41.0 1178.2 78.546667

  <script>
    const buttonEl =
      document.querySelector('#df-650575f1-c764-4097-b860-3fa1b26021c5 button.colab-df-convert');
    buttonEl.style.display =
      google.colab.kernel.accessAllowed ? 'block' : 'none';

    async function convertToInteractive(key) {
      const element = document.querySelector('#df-650575f1-c764-4097-b860-3fa1b26021c5');
      const dataTable =
        await google.colab.kernel.invokeFunction('convertToInteractive',
                                                 [key], {});
      if (!dataTable) return;

      const docLinkHtml = 'Like what you see? Visit the ' +
        '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
        + ' to learn more about interactive tables.';
      element.innerHTML = '';
      dataTable['output_type'] = 'display_data';
      await google.colab.output.renderOutput(dataTable, element);
      const docLink = document.createElement('div');
      docLink.innerHTML = docLinkHtml;
      element.appendChild(docLink);
    }
  </script>
</div>
1
2
3
4
# 집계 함수
import numpy as np

juice.groupby(['Location'])['Revenue', 'Lemon'].agg([max, min, sum, np.mean])
/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:4: FutureWarning: Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.
  after removing the cwd from sys.path.

Revenue Lemon
max min sum mean max min sum mean
Location
Beach 95.5 43.0 1002.8 58.988235 162 76 2020 118.823529
Park 134.5 41.0 1178.2 78.546667 176 71 1697 113.133333

  <script>
    const buttonEl =
      document.querySelector('#df-7a3b6989-de2d-4a76-8bd8-66538dc5863c button.colab-df-convert');
    buttonEl.style.display =
      google.colab.kernel.accessAllowed ? 'block' : 'none';

    async function convertToInteractive(key) {
      const element = document.querySelector('#df-7a3b6989-de2d-4a76-8bd8-66538dc5863c');
      const dataTable =
        await google.colab.kernel.invokeFunction('convertToInteractive',
                                                 [key], {});
      if (!dataTable) return;

      const docLinkHtml = 'Like what you see? Visit the ' +
        '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
        + ' to learn more about interactive tables.';
      element.innerHTML = '';
      dataTable['output_type'] = 'display_data';
      await google.colab.output.renderOutput(dataTable, element);
      const docLink = document.createElement('div');
      docLink.innerHTML = docLinkHtml;
      element.appendChild(docLink);
    }
  </script>
</div>