데이터 불러오기
In [ ]:
import pandas as pd
AAPL = pd.read_csv('https://raw.githubusercontent.com/jin0choi1216/dataset/main/AAPL.csv', index_col=0)
AAPL.head()
Out[ ]:
OpenHighLowCloseAdj CloseVolumeDate2012-01-032012-01-042012-01-052012-01-062012-01-09
14.621429 | 14.732143 | 14.607143 | 14.686786 | 12.466090 | 302220800 |
14.642857 | 14.810000 | 14.617143 | 14.765714 | 12.533089 | 260022000 |
14.819643 | 14.948214 | 14.738214 | 14.929643 | 12.672229 | 271269600 |
14.991786 | 15.098214 | 14.972143 | 15.085714 | 12.804703 | 318292800 |
15.196429 | 15.276786 | 15.048214 | 15.061786 | 12.784389 | 394024400 |
In [ ]:
AAPL.tail()
Out[ ]:
OpenHighLowCloseAdj CloseVolumeDate2023-08-212023-08-222023-08-232023-08-242023-08-25
175.070007 | 176.130005 | 173.740005 | 175.839996 | 175.839996 | 46311900 |
177.059998 | 177.679993 | 176.250000 | 177.229996 | 177.229996 | 42084200 |
178.520004 | 181.550003 | 178.330002 | 181.119995 | 181.119995 | 52722800 |
180.669998 | 181.100006 | 176.009995 | 176.380005 | 176.380005 | 54945800 |
177.380005 | 179.149994 | 175.820007 | 178.610001 | 178.610001 | 51418700 |
데이터 추출하기
In [ ]:
# AAPL['Close'] >= 150 --> return Boolean type
AAPL[AAPL['Close'] >= 150] # return True value
Out[ ]:
OpenHighLowCloseAdj CloseVolumeDate2021-08-162021-08-172021-08-302021-08-312021-09-01...2023-08-212023-08-222023-08-232023-08-242023-08-25
148.539993 | 151.190002 | 146.470001 | 151.119995 | 149.391647 | 103296000 |
150.229996 | 151.679993 | 149.089996 | 150.190002 | 148.472290 | 92229700 |
149.000000 | 153.490005 | 148.610001 | 153.119995 | 151.368774 | 90956700 |
152.660004 | 152.800003 | 151.289993 | 151.830002 | 150.093552 | 86453100 |
152.830002 | 154.979996 | 152.339996 | 152.509995 | 150.765762 | 80313700 |
... | ... | ... | ... | ... | ... |
175.070007 | 176.130005 | 173.740005 | 175.839996 | 175.839996 | 46311900 |
177.059998 | 177.679993 | 176.250000 | 177.229996 | 177.229996 | 42084200 |
178.520004 | 181.550003 | 178.330002 | 181.119995 | 181.119995 | 52722800 |
180.669998 | 181.100006 | 176.009995 | 176.380005 | 176.380005 | 54945800 |
177.380005 | 179.149994 | 175.820007 | 178.610001 | 178.610001 | 51418700 |
333 rows × 6 columns
In [ ]:
# & : and
# | : or
AAPL[(AAPL['Close'] >= 150) & (AAPL['Open'] >= 160)]
Out[ ]:
OpenHighLowCloseAdj CloseVolumeDate2021-11-222021-11-232021-11-242021-12-012021-12-03...2023-08-212023-08-222023-08-232023-08-242023-08-25
161.679993 | 165.699997 | 161.000000 | 161.020004 | 159.410767 | 117467900 |
161.119995 | 161.800003 | 159.059998 | 161.410004 | 159.796844 | 96041900 |
160.750000 | 162.139999 | 159.639999 | 161.940002 | 160.321533 | 69463600 |
167.479996 | 170.300003 | 164.529999 | 164.770004 | 163.123276 | 152052500 |
164.020004 | 164.960007 | 159.720001 | 161.839996 | 160.222549 | 118023100 |
... | ... | ... | ... | ... | ... |
175.070007 | 176.130005 | 173.740005 | 175.839996 | 175.839996 | 46311900 |
177.059998 | 177.679993 | 176.250000 | 177.229996 | 177.229996 | 42084200 |
178.520004 | 181.550003 | 178.330002 | 181.119995 | 181.119995 | 52722800 |
180.669998 | 181.100006 | 176.009995 | 176.380005 | 176.380005 | 54945800 |
177.380005 | 179.149994 | 175.820007 | 178.610001 | 178.610001 | 51418700 |
224 rows × 6 columns
In [ ]:
AAPL.query('Volume <= 75000000') # Volume : column
Out[ ]:
OpenHighLowCloseAdj CloseVolumeDate2014-12-242015-11-272015-12-242016-08-122016-11-25...2023-08-212023-08-222023-08-232023-08-242023-08-25
28.145000 | 28.177500 | 28.002501 | 28.002501 | 25.099218 | 57918400 |
29.572500 | 29.602501 | 29.400000 | 29.452499 | 26.848679 | 52185600 |
27.250000 | 27.250000 | 26.987499 | 27.007500 | 24.619831 | 54281600 |
26.945000 | 27.110001 | 26.945000 | 27.045000 | 25.073812 | 74641600 |
27.782499 | 27.967501 | 27.737499 | 27.947500 | 26.043564 | 45903600 |
... | ... | ... | ... | ... | ... |
175.070007 | 176.130005 | 173.740005 | 175.839996 | 175.839996 | 46311900 |
177.059998 | 177.679993 | 176.250000 | 177.229996 | 177.229996 | 42084200 |
178.520004 | 181.550003 | 178.330002 | 181.119995 | 181.119995 | 52722800 |
180.669998 | 181.100006 | 176.009995 | 176.380005 | 176.380005 | 54945800 |
177.380005 | 179.149994 | 175.820007 | 178.610001 | 178.610001 | 51418700 |
417 rows × 6 columns
In [ ]:
AAPL.query("Date > '2023-01-01'")
Out[ ]:
OpenHighLowCloseAdj CloseVolumeDate2023-01-032023-01-042023-01-052023-01-062023-01-09...2023-08-212023-08-222023-08-232023-08-242023-08-25
130.279999 | 130.899994 | 124.169998 | 125.070000 | 124.538658 | 112117500 |
126.889999 | 128.660004 | 125.080002 | 126.360001 | 125.823189 | 89113600 |
127.129997 | 127.769997 | 124.760002 | 125.019997 | 124.488869 | 80962700 |
126.010002 | 130.289993 | 124.889999 | 129.619995 | 129.069336 | 87754700 |
130.470001 | 133.410004 | 129.889999 | 130.149994 | 129.597076 | 70790800 |
... | ... | ... | ... | ... | ... |
175.070007 | 176.130005 | 173.740005 | 175.839996 | 175.839996 | 46311900 |
177.059998 | 177.679993 | 176.250000 | 177.229996 | 177.229996 | 42084200 |
178.520004 | 181.550003 | 178.330002 | 181.119995 | 181.119995 | 52722800 |
180.669998 | 181.100006 | 176.009995 | 176.380005 | 176.380005 | 54945800 |
177.380005 | 179.149994 | 175.820007 | 178.610001 | 178.610001 | 51418700 |
163 rows × 6 columns
In [ ]:
AAPL.query('Open > Close') # column 비교 가능
Out[ ]:
OpenHighLowCloseAdj CloseVolumeDate2012-01-092012-01-102012-01-112012-01-122012-01-19...2023-08-102023-08-152023-08-162023-08-172023-08-24
15.196429 | 15.276786 | 15.048214 | 15.061786 | 12.784389 | 394024400 |
15.211071 | 15.214286 | 15.053571 | 15.115714 | 12.830169 | 258196400 |
15.095714 | 15.101786 | 14.975357 | 15.091071 | 12.809250 | 215084800 |
15.081429 | 15.103571 | 14.955357 | 15.049643 | 12.774082 | 212587200 |
15.362500 | 15.406071 | 15.232500 | 15.276786 | 12.966883 | 261738400 |
... | ... | ... | ... | ... | ... |
179.479996 | 180.750000 | 177.600006 | 177.970001 | 177.729996 | 54686900 |
178.880005 | 179.479996 | 177.050003 | 177.449997 | 177.449997 | 43622600 |
177.130005 | 178.539993 | 176.500000 | 176.570007 | 176.570007 | 46964900 |
177.139999 | 177.509995 | 173.479996 | 174.000000 | 174.000000 | 66062900 |
180.669998 | 181.100006 | 176.009995 | 176.380005 | 176.380005 | 54945800 |
1388 rows × 6 columns
In [ ]:
AAPL[AAPL['Volume'].isin([302220800, 260022000])]
Out[ ]:
OpenHighLowCloseAdj CloseVolumeDate2012-01-032012-01-04
14.621429 | 14.732143 | 14.607143 | 14.686786 | 12.466090 | 302220800 |
14.642857 | 14.810000 | 14.617143 | 14.765714 | 12.533089 | 260022000 |
In [ ]:
AAPL.query("Volume in [302220800, 260022000]")
Out[ ]:
OpenHighLowCloseAdj CloseVolumeDate2012-01-032012-01-04
14.621429 | 14.732143 | 14.607143 | 14.686786 | 12.466090 | 302220800 |
14.642857 | 14.810000 | 14.617143 | 14.765714 | 12.533089 | 260022000 |
In [ ]:
AAPL.query('(Close >= 150) and (Open >= 160)')
Out[ ]:
OpenHighLowCloseAdj CloseVolumeDate2021-11-222021-11-232021-11-242021-12-012021-12-03...2023-08-212023-08-222023-08-232023-08-242023-08-25
161.679993 | 165.699997 | 161.000000 | 161.020004 | 159.410767 | 117467900 |
161.119995 | 161.800003 | 159.059998 | 161.410004 | 159.796844 | 96041900 |
160.750000 | 162.139999 | 159.639999 | 161.940002 | 160.321533 | 69463600 |
167.479996 | 170.300003 | 164.529999 | 164.770004 | 163.123276 | 152052500 |
164.020004 | 164.960007 | 159.720001 | 161.839996 | 160.222549 | 118023100 |
... | ... | ... | ... | ... | ... |
175.070007 | 176.130005 | 173.740005 | 175.839996 | 175.839996 | 46311900 |
177.059998 | 177.679993 | 176.250000 | 177.229996 | 177.229996 | 42084200 |
178.520004 | 181.550003 | 178.330002 | 181.119995 | 181.119995 | 52722800 |
180.669998 | 181.100006 | 176.009995 | 176.380005 | 176.380005 | 54945800 |
177.380005 | 179.149994 | 175.820007 | 178.610001 | 178.610001 | 51418700 |
224 rows × 6 columns
In [ ]:
AAPL.query('Volume == 69463600')
Out[ ]:
OpenHighLowCloseAdj CloseVolumeDate2021-11-24
160.75 | 162.139999 | 159.639999 | 161.940002 | 160.321533 | 69463600 |
In [ ]:
sample_volume = 69463600
AAPL.query('Volume == @sample_volume') # @ 변수 활용 가능
# AAPL.query(f'Volume == {sample_volume}') # 당연히 f-String 가능
Out[ ]:
OpenHighLowCloseAdj CloseVolumeDate2021-11-24
160.75 | 162.139999 | 159.639999 | 161.940002 | 160.321533 | 69463600 |
In [ ]:
AAPL.reset_index() # inplace=False 기본값
Out[ ]:
DateOpenHighLowCloseAdj CloseVolume01234...29262927292829292930
2012-01-03 | 14.621429 | 14.732143 | 14.607143 | 14.686786 | 12.466090 | 302220800 |
2012-01-04 | 14.642857 | 14.810000 | 14.617143 | 14.765714 | 12.533089 | 260022000 |
2012-01-05 | 14.819643 | 14.948214 | 14.738214 | 14.929643 | 12.672229 | 271269600 |
2012-01-06 | 14.991786 | 15.098214 | 14.972143 | 15.085714 | 12.804703 | 318292800 |
2012-01-09 | 15.196429 | 15.276786 | 15.048214 | 15.061786 | 12.784389 | 394024400 |
... | ... | ... | ... | ... | ... | ... |
2023-08-21 | 175.070007 | 176.130005 | 173.740005 | 175.839996 | 175.839996 | 46311900 |
2023-08-22 | 177.059998 | 177.679993 | 176.250000 | 177.229996 | 177.229996 | 42084200 |
2023-08-23 | 178.520004 | 181.550003 | 178.330002 | 181.119995 | 181.119995 | 52722800 |
2023-08-24 | 180.669998 | 181.100006 | 176.009995 | 176.380005 | 176.380005 | 54945800 |
2023-08-25 | 177.380005 | 179.149994 | 175.820007 | 178.610001 | 178.610001 | 51418700 |
2931 rows × 7 columns
Q. Volumne이 평균 이상이면서 Open이 Close 보다 높은 데이터를 선택한 데이터프레임을 가져와주세요.
In [ ]:
AAPL[(AAPL['Volume'] >= AAPL['Volume'].mean()) & (AAPL['Open'] >= AAPL['Close'])]
Out[ ]:
OpenHighLowCloseAdj CloseVolumeDate2012-01-092012-01-102012-01-112012-01-122012-01-19...2020-09-022020-09-032020-09-082020-09-182020-10-13
15.196429 | 15.276786 | 15.048214 | 15.061786 | 12.784389 | 394024400 |
15.211071 | 15.214286 | 15.053571 | 15.115714 | 12.830169 | 258196400 |
15.095714 | 15.101786 | 14.975357 | 15.091071 | 12.809250 | 215084800 |
15.081429 | 15.103571 | 14.955357 | 15.049643 | 12.774082 | 212587200 |
15.362500 | 15.406071 | 15.232500 | 15.276786 | 12.966883 | 261738400 |
... | ... | ... | ... | ... | ... |
137.589996 | 137.979996 | 127.000000 | 131.399994 | 129.067047 | 200119000 |
126.910004 | 128.839996 | 120.500000 | 120.879997 | 118.733826 | 257599600 |
113.949997 | 118.989998 | 112.680000 | 112.820000 | 110.816933 | 231366600 |
110.400002 | 110.879997 | 106.089996 | 106.839996 | 104.943100 | 287104900 |
125.269997 | 125.389999 | 119.650002 | 121.099998 | 118.949928 | 262330500 |
484 rows × 6 columns
In [ ]:
con1 = AAPL['Volume'] >= AAPL['Volume'].mean()
con2 = AAPL['Open'] >= AAPL['Close']
AAPL[con1 & con2]
Out[ ]:
OpenHighLowCloseAdj CloseVolumeDate2012-01-092012-01-102012-01-112012-01-122012-01-19...2020-09-022020-09-032020-09-082020-09-182020-10-13
15.196429 | 15.276786 | 15.048214 | 15.061786 | 12.784389 | 394024400 |
15.211071 | 15.214286 | 15.053571 | 15.115714 | 12.830169 | 258196400 |
15.095714 | 15.101786 | 14.975357 | 15.091071 | 12.809250 | 215084800 |
15.081429 | 15.103571 | 14.955357 | 15.049643 | 12.774082 | 212587200 |
15.362500 | 15.406071 | 15.232500 | 15.276786 | 12.966883 | 261738400 |
... | ... | ... | ... | ... | ... |
137.589996 | 137.979996 | 127.000000 | 131.399994 | 129.067047 | 200119000 |
126.910004 | 128.839996 | 120.500000 | 120.879997 | 118.733826 | 257599600 |
113.949997 | 118.989998 | 112.680000 | 112.820000 | 110.816933 | 231366600 |
110.400002 | 110.879997 | 106.089996 | 106.839996 | 104.943100 | 287104900 |
125.269997 | 125.389999 | 119.650002 | 121.099998 | 118.949928 | 262330500 |
484 rows × 6 columns
In [ ]:
AAPL.query('(Volume >= Volume.mean()) and (Open >= Close)')
Out[ ]:
OpenHighLowCloseAdj CloseVolumeDate2012-01-092012-01-102012-01-112012-01-122012-01-19...2020-09-022020-09-032020-09-082020-09-182020-10-13
15.196429 | 15.276786 | 15.048214 | 15.061786 | 12.784389 | 394024400 |
15.211071 | 15.214286 | 15.053571 | 15.115714 | 12.830169 | 258196400 |
15.095714 | 15.101786 | 14.975357 | 15.091071 | 12.809250 | 215084800 |
15.081429 | 15.103571 | 14.955357 | 15.049643 | 12.774082 | 212587200 |
15.362500 | 15.406071 | 15.232500 | 15.276786 | 12.966883 | 261738400 |
... | ... | ... | ... | ... | ... |
137.589996 | 137.979996 | 127.000000 | 131.399994 | 129.067047 | 200119000 |
126.910004 | 128.839996 | 120.500000 | 120.879997 | 118.733826 | 257599600 |
113.949997 | 118.989998 | 112.680000 | 112.820000 | 110.816933 | 231366600 |
110.400002 | 110.879997 | 106.089996 | 106.839996 | 104.943100 | 287104900 |
125.269997 | 125.389999 | 119.650002 | 121.099998 | 118.949928 | 262330500 |
484 rows × 6 columns
In [ ]:
mean_volume = AAPL['Volume'].mean()
AAPL.query('Volume >= @mean_volume and Open >= Close')
Out[ ]:
OpenHighLowCloseAdj CloseVolumeDate2012-01-092012-01-102012-01-112012-01-122012-01-19...2020-09-022020-09-032020-09-082020-09-182020-10-13
15.196429 | 15.276786 | 15.048214 | 15.061786 | 12.784389 | 394024400 |
15.211071 | 15.214286 | 15.053571 | 15.115714 | 12.830169 | 258196400 |
15.095714 | 15.101786 | 14.975357 | 15.091071 | 12.809250 | 215084800 |
15.081429 | 15.103571 | 14.955357 | 15.049643 | 12.774082 | 212587200 |
15.362500 | 15.406071 | 15.232500 | 15.276786 | 12.966883 | 261738400 |
... | ... | ... | ... | ... | ... |
137.589996 | 137.979996 | 127.000000 | 131.399994 | 129.067047 | 200119000 |
126.910004 | 128.839996 | 120.500000 | 120.879997 | 118.733826 | 257599600 |
113.949997 | 118.989998 | 112.680000 | 112.820000 | 110.816933 | 231366600 |
110.400002 | 110.879997 | 106.089996 | 106.839996 | 104.943100 | 287104900 |
125.269997 | 125.389999 | 119.650002 | 121.099998 | 118.949928 | 262330500 |
484 rows × 6 columns
apply
In [ ]:
df_kospi = pd.read_csv('https://raw.githubusercontent.com/jin0choi1216/dataset/main/KOSPI_stocks.csv', index_col=0)
In [ ]:
df_kospi.head()
Out[ ]:
CodeISU_CDNameMarketDeptCloseChangeCodeChangesChagesRatioOpenHighLowVolumeAmountMarcapStocksMarketId01234
005930 | KR7005930003 | 삼성전자 | KOSPI | NaN | 67100 | 2 | -1100 | -1.61 | 67100 | 67400 | 66900 | 7032462 | 471934306900 | 400572409105000 | 5969782550 | STK |
373220 | KR7373220003 | LG에너지솔루션 | KOSPI | NaN | 546000 | 1 | 11000 | 2.06 | 527000 | 549000 | 525000 | 249493 | 135513119000 | 127764000000000 | 234000000 | STK |
000660 | KR7000660001 | SK하이닉스 | KOSPI | NaN | 116500 | 2 | -4400 | -3.64 | 117400 | 118300 | 115300 | 3533647 | 412064064200 | 84812275522500 | 728002365 | STK |
207940 | KR7207940008 | 삼성바이오로직스 | KOSPI | NaN | 757000 | 2 | -6000 | -0.79 | 755000 | 764000 | 754000 | 23435 | 17763884000 | 53878718000000 | 71174000 | STK |
005490 | KR7005490008 | POSCO홀딩스 | KOSPI | NaN | 577000 | 2 | -2000 | -0.35 | 567000 | 583000 | 561000 | 904736 | 519579225000 | 48797599710000 | 84571230 | STK |
In [ ]:
# df_kospi['ChagesRatio'].apply('함수')
def check_change(data):
if data > 0:
return '오른주식'
else:
return '내린주식'
In [ ]:
df_kospi['방법1'] = df_kospi['ChagesRatio'].apply(check_change)
df_kospi.head()
Out[ ]:
CodeISU_CDNameMarketDeptCloseChangeCodeChangesChagesRatioOpenHighLowVolumeAmountMarcapStocksMarketId방법101234
005930 | KR7005930003 | 삼성전자 | KOSPI | NaN | 67100 | 2 | -1100 | -1.61 | 67100 | 67400 | 66900 | 7032462 | 471934306900 | 400572409105000 | 5969782550 | STK | 내린주식 |
373220 | KR7373220003 | LG에너지솔루션 | KOSPI | NaN | 546000 | 1 | 11000 | 2.06 | 527000 | 549000 | 525000 | 249493 | 135513119000 | 127764000000000 | 234000000 | STK | 오른주식 |
000660 | KR7000660001 | SK하이닉스 | KOSPI | NaN | 116500 | 2 | -4400 | -3.64 | 117400 | 118300 | 115300 | 3533647 | 412064064200 | 84812275522500 | 728002365 | STK | 내린주식 |
207940 | KR7207940008 | 삼성바이오로직스 | KOSPI | NaN | 757000 | 2 | -6000 | -0.79 | 755000 | 764000 | 754000 | 23435 | 17763884000 | 53878718000000 | 71174000 | STK | 내린주식 |
005490 | KR7005490008 | POSCO홀딩스 | KOSPI | NaN | 577000 | 2 | -2000 | -0.35 | 567000 | 583000 | 561000 | 904736 | 519579225000 | 48797599710000 | 84571230 | STK | 내린주식 |
In [ ]:
# 방법2
df_kospi['방법2'] = df_kospi['ChagesRatio'].apply(lambda x : '오른주식' if x > 0 else '내린주식')
df_kospi.head()
Out[ ]:
CodeISU_CDNameMarketDeptCloseChangeCodeChangesChagesRatioOpenHighLowVolumeAmountMarcapStocksMarketId방법1방법201234
005930 | KR7005930003 | 삼성전자 | KOSPI | NaN | 67100 | 2 | -1100 | -1.61 | 67100 | 67400 | 66900 | 7032462 | 471934306900 | 400572409105000 | 5969782550 | STK | 내린주식 | 내린주식 |
373220 | KR7373220003 | LG에너지솔루션 | KOSPI | NaN | 546000 | 1 | 11000 | 2.06 | 527000 | 549000 | 525000 | 249493 | 135513119000 | 127764000000000 | 234000000 | STK | 오른주식 | 오른주식 |
000660 | KR7000660001 | SK하이닉스 | KOSPI | NaN | 116500 | 2 | -4400 | -3.64 | 117400 | 118300 | 115300 | 3533647 | 412064064200 | 84812275522500 | 728002365 | STK | 내린주식 | 내린주식 |
207940 | KR7207940008 | 삼성바이오로직스 | KOSPI | NaN | 757000 | 2 | -6000 | -0.79 | 755000 | 764000 | 754000 | 23435 | 17763884000 | 53878718000000 | 71174000 | STK | 내린주식 | 내린주식 |
005490 | KR7005490008 | POSCO홀딩스 | KOSPI | NaN | 577000 | 2 | -2000 | -0.35 | 567000 | 583000 | 561000 | 904736 | 519579225000 | 48797599710000 | 84571230 | STK | 내린주식 | 내린주식 |
In [ ]:
import numpy as np
df_kospi['방법3'] = np.where(df_kospi['ChagesRatio'] > 0, '오른주식', '내린주식')
df_kospi.head()
Out[ ]:
CodeISU_CDNameMarketDeptCloseChangeCodeChangesChagesRatioOpenHighLowVolumeAmountMarcapStocksMarketId방법1방법2방법301234
005930 | KR7005930003 | 삼성전자 | KOSPI | NaN | 67100 | 2 | -1100 | -1.61 | 67100 | 67400 | 66900 | 7032462 | 471934306900 | 400572409105000 | 5969782550 | STK | 내린주식 | 내린주식 | 내린주식 |
373220 | KR7373220003 | LG에너지솔루션 | KOSPI | NaN | 546000 | 1 | 11000 | 2.06 | 527000 | 549000 | 525000 | 249493 | 135513119000 | 127764000000000 | 234000000 | STK | 오른주식 | 오른주식 | 오른주식 |
000660 | KR7000660001 | SK하이닉스 | KOSPI | NaN | 116500 | 2 | -4400 | -3.64 | 117400 | 118300 | 115300 | 3533647 | 412064064200 | 84812275522500 | 728002365 | STK | 내린주식 | 내린주식 | 내린주식 |
207940 | KR7207940008 | 삼성바이오로직스 | KOSPI | NaN | 757000 | 2 | -6000 | -0.79 | 755000 | 764000 | 754000 | 23435 | 17763884000 | 53878718000000 | 71174000 | STK | 내린주식 | 내린주식 | 내린주식 |
005490 | KR7005490008 | POSCO홀딩스 | KOSPI | NaN | 577000 | 2 | -2000 | -0.35 | 567000 | 583000 | 561000 | 904736 | 519579225000 | 48797599710000 | 84571230 | STK | 내린주식 | 내린주식 | 내린주식 |
value_counts()
In [ ]:
(df_kospi['방법2'] == df_kospi['방법3']).value_counts()
Out[ ]:
True 951
Name: count, dtype: int64
In [ ]:
df_kospi['방법1'].value_counts()
Out[ ]:
방법1
내린주식 583
오른주식 368
Name: count, dtype: int64
rank
In [ ]:
df_kospi['Marcap'].rank(ascending=False)
Out[ ]:
0 1.0
1 2.0
2 3.0
3 4.0
4 5.0
...
946 947.0
947 948.0
948 949.0
949 950.0
950 951.0
Name: Marcap, Length: 951, dtype: float64
실습
- 주당 순이익(EPS)이 0 이상 기업 중 PER 순위 - EPS 순위의 차이가 가장 적은 10개의 기업 출력
In [ ]:
df = pd.read_csv('https://raw.githubusercontent.com/jin0choi1216/dataset/main/market_fundamental_20230817.csv', index_col = '티커')
df
Out[ ]:
BPSPERPBREPSDIVDPS티커095570006840027410282330138930...079980005010000540000545003280
8076 | 20.95 | 0.52 | 201 | 6.41 | 270 |
41948 | 0.00 | 0.48 | 0 | 1.00 | 200 |
16528 | 14.60 | 0.22 | 247 | 3.05 | 110 |
55724 | 14.63 | 2.94 | 11203 | 2.50 | 4100 |
30468 | 2.74 | 0.22 | 2404 | 9.48 | 625 |
... | ... | ... | ... | ... | ... |
11585 | 0.00 | 0.43 | 0 | 0.00 | 0 |
16181 | 0.87 | 0.30 | 5675 | 7.13 | 350 |
11208 | 1.35 | 0.26 | 2142 | 0.00 | 0 |
0 | 0.00 | 0.00 | 0 | 0.00 | 0 |
544 | 19.84 | 3.43 | 94 | 0.00 | 0 |
920 rows × 6 columns
In [ ]:
df2 = df[df['EPS'] >= 0]
df2['EPS_rank'] = df2['EPS'].rank(ascending=False)
df2['PER_rank'] = df2['PER'].rank()
df2['PER_EPS_diff_rank'] = abs(df2['PER_rank'] - df2['EPS_rank']) # abs 절대값
df2.sort_values('PER_EPS_diff_rank').head(10)
Out[ ]:
BPSPERPBREPSDIVDPSEPS_rankPER_rankPER_EPS_diff_rank티커009180001250129260016090018500093240085310009270088350004150
3726 | 2.83 | 0.67 | 882 | 2.80 | 70 | 371.5 | 371.5 | 0.0 |
4813 | 2.92 | 0.52 | 851 | 0.00 | 0 | 377.0 | 377.5 | 0.5 |
8867 | 2.17 | 0.27 | 1092 | 5.06 | 120 | 341.5 | 344.5 | 3.0 |
5480 | 3.63 | 0.38 | 574 | 5.28 | 110 | 414.5 | 411.0 | 3.5 |
1314 | 4.34 | 1.60 | 485 | 0.00 | 0 | 441.0 | 437.0 | 4.0 |
2200 | 5.79 | 0.69 | 262 | 0.00 | 0 | 487.0 | 480.5 | 6.5 |
1457 | 7.75 | 0.72 | 136 | 0.00 | 0 | 539.0 | 548.0 | 9.0 |
2866 | 5.96 | 0.47 | 224 | 7.49 | 100 | 500.5 | 490.0 | 10.5 |
9020 | 2.60 | 0.25 | 855 | 0.00 | 0 | 376.0 | 363.0 | 13.0 |
13522 | 2.44 | 0.21 | 1144 | 4.29 | 120 | 335.0 | 353.0 | 18.0 |
In [ ]:
# loc 사용
df3 = df[df.loc[:,'EPS'] >=0 ]
df3.loc[:,'EPS_rank'] = df3.loc[:,'EPS'].rank(ascending=False)
df3.loc[:,'PER_rank'] = df3.loc[:,'PER'].rank(ascending=True)
df3.loc[:,'PER_EPS_diff_rank'] = abs(df3.loc[:,'PER_rank'] - df3.loc[:,'EPS_rank'])
df3.sort_values(by=['PER_EPS_diff_rank'], ascending=True, inplace=False).head(10)
Out[ ]:
BPSPERPBREPSDIVDPSEPS_rankPER_rankPER_EPS_diff_rank티커009180001250129260016090018500093240085310009270088350004150
3726 | 2.83 | 0.67 | 882 | 2.80 | 70 | 371.5 | 371.5 | 0.0 |
4813 | 2.92 | 0.52 | 851 | 0.00 | 0 | 377.0 | 377.5 | 0.5 |
8867 | 2.17 | 0.27 | 1092 | 5.06 | 120 | 341.5 | 344.5 | 3.0 |
5480 | 3.63 | 0.38 | 574 | 5.28 | 110 | 414.5 | 411.0 | 3.5 |
1314 | 4.34 | 1.60 | 485 | 0.00 | 0 | 441.0 | 437.0 | 4.0 |
2200 | 5.79 | 0.69 | 262 | 0.00 | 0 | 487.0 | 480.5 | 6.5 |
1457 | 7.75 | 0.72 | 136 | 0.00 | 0 | 539.0 | 548.0 | 9.0 |
2866 | 5.96 | 0.47 | 224 | 7.49 | 100 | 500.5 | 490.0 | 10.5 |
9020 | 2.60 | 0.25 | 855 | 0.00 | 0 | 376.0 | 363.0 | 13.0 |
13522 | 2.44 | 0.21 | 1144 | 4.29 | 120 | 335.0 | 353.0 | 18.0 |
In [ ]:
df_krx = pd.read_csv("https://raw.githubusercontent.com/jin0choi1216/dataset/main/KRX_stocks.csv", index_col=0)
df_krx.head()
Out[ ]:
CodeISU_CDNameMarketDeptCloseChangeCodeChangesChagesRatioOpenHighLowVolumeAmountMarcapStocksMarketId01234
005930 | KR7005930003 | 삼성전자 | KOSPI | NaN | 67100 | 2 | -1100 | -1.61 | 67100 | 67400 | 66900 | 7032462 | 471934306900 | 400572409105000 | 5969782550 | STK |
373220 | KR7373220003 | LG에너지솔루션 | KOSPI | NaN | 546000 | 1 | 11000 | 2.06 | 527000 | 549000 | 525000 | 249493 | 135513119000 | 127764000000000 | 234000000 | STK |
000660 | KR7000660001 | SK하이닉스 | KOSPI | NaN | 116500 | 2 | -4400 | -3.64 | 117400 | 118300 | 115300 | 3533647 | 412064064200 | 84812275522500 | 728002365 | STK |
207940 | KR7207940008 | 삼성바이오로직스 | KOSPI | NaN | 757000 | 2 | -6000 | -0.79 | 755000 | 764000 | 754000 | 23435 | 17763884000 | 53878718000000 | 71174000 | STK |
005490 | KR7005490008 | POSCO홀딩스 | KOSPI | NaN | 577000 | 2 | -2000 | -0.35 | 567000 | 583000 | 561000 | 904736 | 519579225000 | 48797599710000 | 84571230 | STK |
In [ ]:
df_krx['Market'].unique()
Out[ ]:
array(['KOSPI', 'KOSDAQ GLOBAL', 'KOSDAQ', 'KONEX'], dtype=object)
Groupby
In [ ]:
# Market 별 ChagesRatio 의 평균값 출력
df_krx.groupby('Market')['ChagesRatio'].mean()
Out[ ]:
Market
KONEX -0.168281
KOSDAQ -0.391093
KOSDAQ GLOBAL -0.813600
KOSPI 0.107560
Name: ChagesRatio, dtype: float64
In [ ]:
#df_krx['Market'] # 시리즈 pandas.core.series.Series
#df_krx['Market','ChagesRatio'] # 다수의 컬럼을 데이터프레임으로 출력하고 싶은데 오류 발생한다.
# 'Market' 은 시리즈이다. 여러개의 시리즈를 입력하는 방법으로 리스트를 사용한다. 리스트 ['Market','ChagesRatio']
df_krx[['Market','ChagesRatio']] # 데이터프레임. type(df_krx[['Market','ChagesRatio']]) -> pandas.core.frame.DataFrame
Out[ ]:
MarketChagesRatio01234...27432744274527462747
KOSPI | -1.61 |
KOSPI | 2.06 |
KOSPI | -3.64 |
KOSPI | -0.79 |
KOSPI | -0.35 |
... | ... |
KOSPI | -0.10 |
KONEX | -1.75 |
KONEX | 7.14 |
KONEX | -2.80 |
KONEX | -12.12 |
2748 rows × 2 columns
In [ ]:
df_krx[['Market','ChagesRatio','Stocks']].groupby('Market').agg({'ChagesRatio' : 'max', 'Stocks' : 'min'})
Out[ ]:
ChagesRatioStocksMarketKONEXKOSDAQKOSDAQ GLOBALKOSPI
14.89 | 900000 |
29.99 | 153536 |
13.70 | 6275415 |
30.00 | 200000 |
In [ ]:
df_krx[['Market','ChagesRatio','Stocks']].groupby('Market').agg({'ChagesRatio' : ['max','sum'], 'Stocks' : ['min','sum']})
Out[ ]:
ChagesRatioStocksmaxsumminsumMarketKONEXKOSDAQKOSDAQ GLOBALKOSPI
14.89 | -21.54 | 900000 | 1021509220 |
29.99 | -633.18 | 153536 | 49395684060 |
13.70 | -40.68 | 6275415 | 1893215335 |
30.00 | 102.29 | 200000 | 62776937252 |
Merge(== join)
In [ ]:
market_price_change = pd.read_csv('https://raw.githubusercontent.com/jin0choi1216/dataset/main/market_price_change_2022.csv')
purchases_of_equities = pd.read_csv('https://raw.githubusercontent.com/jin0choi1216/dataset/main/purchases_of_equities_2022.csv')
In [ ]:
(market_price_change)
Out[ ]:
티커종목명시가종가변동폭등락률거래량거래대금01234...937938939940941
095570 | AJ네트웍스 | 6110 | 5720 | -390 | -6.38 | 2947645 | 17162545200 |
006840 | AK홀딩스 | 15150 | 17200 | 2050 | 13.53 | 455492 | 7706148650 |
027410 | BGF | 4365 | 4305 | -60 | -1.37 | 14388855 | 62685829240 |
282330 | BGF리테일 | 205500 | 210500 | 5000 | 2.43 | 712994 | 146603104000 |
138930 | BNK금융지주 | 7390 | 6500 | -890 | -12.04 | 37130723 | 264104001940 |
... | ... | ... | ... | ... | ... | ... | ... |
005010 | 휴스틸 | 5420 | 5070 | -350 | -6.46 | 9608972 | 49226534830 |
000540 | 흥국화재 | 3165 | 3370 | 205 | 6.48 | 15972406 | 59888402133 |
000547 | 흥국화재2우B | 19300 | 16200 | -3100 | -16.06 | 18918 | 364549550 |
000545 | 흥국화재우 | 6020 | 6150 | 130 | 2.16 | 68957 | 432345040 |
003280 | 흥아해운 | 1625 | 1355 | -270 | -16.62 | 4552374 | 6915159460 |
942 rows × 8 columns
In [ ]:
(purchases_of_equities)
Out[ ]:
티커종목명매도거래량매수거래량순매수거래량매도거래대금매수거래대금순매수거래대금01234...26202621262226232624
060310 | 3S | 3318765 | 3318765 | 0 | 7721430565 | 7721430565 | 0 |
095570 | AJ네트웍스 | 2947645 | 2947645 | 0 | 17162545200 | 17162545200 | 0 |
006840 | AK홀딩스 | 455492 | 455492 | 0 | 7706148650 | 7706148650 | 0 |
054620 | APS홀딩스 | 4001612 | 4001612 | 0 | 59468996600 | 59468996600 | 0 |
265520 | AP시스템 | 1662030 | 1662030 | 0 | 31586148100 | 31586148100 | 0 |
... | ... | ... | ... | ... | ... | ... | ... |
000547 | 흥국화재2우B | 18918 | 18918 | 0 | 364549550 | 364549550 | 0 |
000545 | 흥국화재우 | 68957 | 68957 | 0 | 432345040 | 432345040 | 0 |
003280 | 흥아해운 | 4552374 | 4552374 | 0 | 6915159460 | 6915159460 | 0 |
037440 | 희림 | 14299451 | 14299451 | 0 | 144957759090 | 144957759090 | 0 |
238490 | 힘스 | 849304 | 849304 | 0 | 5529584340 | 5529584340 | 0 |
2625 rows × 8 columns
join 이해도가 높으면 좋겠다....
In [ ]:
merge1 = pd.merge(left=market_price_change, right=purchases_of_equities, on=['티커','종목명'])
merge1.shape
Out[ ]:
(916, 14)
In [ ]:
merge2 = pd.merge(left=market_price_change, right=purchases_of_equities, on=['티커','종목명'], how = 'left')
merge2.shape
Out[ ]:
(942, 14)
In [ ]:
merge3 = pd.merge(left=market_price_change, right=purchases_of_equities, on=['티커','종목명'], how = 'right')
merge3.shape
Out[ ]:
(2625, 14)
In [ ]:
merge4 = pd.merge(left=market_price_change, right=purchases_of_equities, on=['티커','종목명'], how = 'outer')
merge4.shape
Out[ ]:
(2651, 14)
In [ ]:
# 칼럼이름이 서로 다를 때
purchases_of_equities2 = purchases_of_equities.rename(columns= {'티커' : '이름변경_티커','종목명' : '이름변경_종목명'})
purchases_of_equities2.head(2) # 임의로 컬럼명을 변경
Out[ ]:
이름변경_티커이름변경_종목명매도거래량매수거래량순매수거래량매도거래대금매수거래대금순매수거래대금01
060310 | 3S | 3318765 | 3318765 | 0 | 7721430565 | 7721430565 | 0 |
095570 | AJ네트웍스 | 2947645 | 2947645 | 0 | 17162545200 | 17162545200 | 0 |
In [ ]:
pd.merge(left=market_price_change, right=purchases_of_equities2
,left_on = ['티커','종목명'], right_on = ['이름변경_티커','이름변경_종목명'], how='inner')
Out[ ]:
티커종목명시가종가변동폭등락률거래량거래대금이름변경_티커이름변경_종목명매도거래량매수거래량순매수거래량매도거래대금매수거래대금순매수거래대금01234...911912913914915
095570 | AJ네트웍스 | 6110 | 5720 | -390 | -6.38 | 2947645 | 17162545200 | 095570 | AJ네트웍스 | 2947645 | 2947645 | 0 | 17162545200 | 17162545200 | 0 |
006840 | AK홀딩스 | 15150 | 17200 | 2050 | 13.53 | 455492 | 7706148650 | 006840 | AK홀딩스 | 455492 | 455492 | 0 | 7706148650 | 7706148650 | 0 |
027410 | BGF | 4365 | 4305 | -60 | -1.37 | 14388855 | 62685829240 | 027410 | BGF | 14388855 | 14388855 | 0 | 62685829240 | 62685829240 | 0 |
282330 | BGF리테일 | 205500 | 210500 | 5000 | 2.43 | 712994 | 146603104000 | 282330 | BGF리테일 | 712994 | 712994 | 0 | 146603104000 | 146603104000 | 0 |
138930 | BNK금융지주 | 7390 | 6500 | -890 | -12.04 | 37130723 | 264104001940 | 138930 | BNK금융지주 | 37130723 | 37130723 | 0 | 264104001940 | 264104001940 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
005010 | 휴스틸 | 5420 | 5070 | -350 | -6.46 | 9608972 | 49226534830 | 005010 | 휴스틸 | 9608972 | 9608972 | 0 | 49226534830 | 49226534830 | 0 |
000540 | 흥국화재 | 3165 | 3370 | 205 | 6.48 | 15972406 | 59888402133 | 000540 | 흥국화재 | 15972406 | 15972406 | 0 | 59888402133 | 59888402133 | 0 |
000547 | 흥국화재2우B | 19300 | 16200 | -3100 | -16.06 | 18918 | 364549550 | 000547 | 흥국화재2우B | 18918 | 18918 | 0 | 364549550 | 364549550 | 0 |
000545 | 흥국화재우 | 6020 | 6150 | 130 | 2.16 | 68957 | 432345040 | 000545 | 흥국화재우 | 68957 | 68957 | 0 | 432345040 | 432345040 | 0 |
003280 | 흥아해운 | 1625 | 1355 | -270 | -16.62 | 4552374 | 6915159460 | 003280 | 흥아해운 | 4552374 | 4552374 | 0 | 6915159460 | 6915159460 | 0 |
916 rows × 16 columns
결측치 개수확인
결합 후 결측치 처리하기
In [ ]:
merge4.isnull().sum()
Out[ ]:
티커 0
종목명 0
시가 1709
종가 1709
변동폭 1709
등락률 1709
거래량 1709
거래대금 1709
매도거래량 26
매수거래량 26
순매수거래량 26
매도거래대금 26
매수거래대금 26
순매수거래대금 26
dtype: int64
pivot_table
index column values 명확할때 유리. 데이터를 재정렬할 때 등
그 외 groupby가 유리
In [ ]:
GOOGL = pd.read_csv('https://raw.githubusercontent.com/jin0choi1216/dataset/main/GOOGL.csv')
AAPL = pd.read_csv('https://raw.githubusercontent.com/jin0choi1216/dataset/main/AAPL.csv')
In [ ]:
# Symbol 을 통해서 데이터를 구분할 수 있다. 그러나 한계점이 있다. Date가 중복된다.
GOOGL['Symbol'] = 'GOOGLE'
AAPL['Symbol'] = 'AAPL'
df = pd.concat([GOOGL, AAPL], axis=0)
df
Out[ ]:
DateOpenHighLowCloseAdj CloseVolumeSymbol01234...29262927292829292930
2012-01-03 | 16.339840 | 16.720470 | 16.325577 | 16.651901 | 16.651901 | 146912940 | |
2012-01-04 | 16.642391 | 16.773024 | 16.532032 | 16.723724 | 16.723724 | 114445440 | |
2012-01-05 | 16.569820 | 16.615866 | 16.422173 | 16.491741 | 16.491741 | 131184684 | |
2012-01-06 | 16.495245 | 16.516518 | 16.261011 | 16.266768 | 16.266768 | 107608284 | |
2012-01-09 | 16.178679 | 16.191191 | 15.546296 | 15.577077 | 15.577077 | 232671096 | |
... | ... | ... | ... | ... | ... | ... | ... |
2023-08-21 | 175.070007 | 176.130005 | 173.740005 | 175.839996 | 175.839996 | 46311900 | AAPL |
2023-08-22 | 177.059998 | 177.679993 | 176.250000 | 177.229996 | 177.229996 | 42084200 | AAPL |
2023-08-23 | 178.520004 | 181.550003 | 178.330002 | 181.119995 | 181.119995 | 52722800 | AAPL |
2023-08-24 | 180.669998 | 181.100006 | 176.009995 | 176.380005 | 176.380005 | 54945800 | AAPL |
2023-08-25 | 177.380005 | 179.149994 | 175.820007 | 178.610001 | 178.610001 | 51418700 | AAPL |
5862 rows × 8 columns
In [ ]:
pd.pivot_table(data=df, index='Date', columns='Symbol', values='Close')
Out[ ]:
SymbolAAPLGOOGLEDate2012-01-032012-01-042012-01-052012-01-062012-01-09...2023-08-212023-08-222023-08-232023-08-242023-08-25
14.686786 | 16.651901 |
14.765714 | 16.723724 |
14.929643 | 16.491741 |
15.085714 | 16.266768 |
15.061786 | 15.577077 |
... | ... |
175.839996 | 128.369995 |
177.229996 | 129.080002 |
181.119995 | 132.369995 |
176.380005 | 129.779999 |
178.610001 | 129.880005 |
2931 rows × 2 columns
In [ ]:
# groupby 비슷한 사용예시
df[['Date','Symbol','Close']].groupby(['Date','Symbol']).mean()
Out[ ]:
CloseDateSymbol2012-01-03AAPLGOOGLE2012-01-04AAPLGOOGLE2012-01-05AAPL......2023-08-23GOOGLE2023-08-24AAPLGOOGLE2023-08-25AAPLGOOGLE
14.686786 |
16.651901 |
14.765714 |
16.723724 |
14.929643 |
... |
132.369995 |
176.380005 |
129.779999 |
178.610001 |
129.880005 |
5862 rows × 1 columns
In [ ]:
# groupby 비슷한 사용예시
pd.pivot_table(data=df, index=['Date','Symbol'], values='Close')
Out[ ]:
CloseDateSymbol2012-01-03AAPLGOOGLE2012-01-04AAPLGOOGLE2012-01-05AAPL......2023-08-23GOOGLE2023-08-24AAPLGOOGLE2023-08-25AAPLGOOGLE
14.686786 |
16.651901 |
14.765714 |
16.723724 |
14.929643 |
... |
132.369995 |
176.380005 |
129.779999 |
178.610001 |
129.880005 |
5862 rows × 1 columns
시계열데이터다루기
datetime 실습
In [ ]:
from datetime import datetime
type(datetime.now())
# 자료형 : datetime
Out[ ]:
datetime.datetime
In [ ]:
date1 = datetime(2023, 9, 12)
date1
Out[ ]:
datetime.datetime(2023, 9, 12, 0, 0)
In [ ]:
# 연, 월, 일, 시, 분, 초
date2 = datetime(2023, 9, 12, 15, 33, 3)
date2
Out[ ]:
datetime.datetime(2023, 9, 12, 15, 33, 3)
In [ ]:
print(date2.year)
print(date2.month)
print(date2.day)
print(date2.hour)
print(date2.weekday()) # 월요일 : 0
2023
9
12
15
1
In [ ]:
date3 = '2023-09-12' # 문자열 자료형이다.
#print(date3.year) # 문자열이므로 year을 사용할 수 없다.
datetime_date3 = pd.to_datetime(date3)
print(datetime_date3.year)
2023
In [ ]:
date4 = '23-09-12' # 2023-09-12. format 으로 형식 정의
pd.to_datetime(date4, format='%y-%m-%d')
Out[ ]:
Timestamp('2023-09-12 00:00:00')
In [ ]:
# %Y%m%d %H:%M:%S # 일반적인 format 사용 예시
datetime.strftime(datetime.now(),format="%Y%m%d %H:%M:%S")
Out[ ]:
'20230912 17:43:15'
In [ ]:
date5 = '2/2/2020'
date6 = 'Jan 01, 2018'
In [ ]:
dt1 = pd.to_datetime(date5, format='%m/%d/%Y')
dt1
Out[ ]:
Timestamp('2020-02-02 00:00:00')
In [ ]:
dt2 = pd.to_datetime(date6, format='%b %d, %Y')
dt2
Out[ ]:
Timestamp('2018-01-01 00:00:00')
In [ ]:
dt1.strftime("%Y년 %m월 %d일")
Out[ ]:
'2020년 02월 02일'
시계열 실습
In [ ]:
AAPL = pd.read_csv('https://raw.githubusercontent.com/jin0choi1216/dataset/main/AAPL.csv')
AAPL.head()
Out[ ]:
DateOpenHighLowCloseAdj CloseVolume01234
2012-01-03 | 14.621429 | 14.732143 | 14.607143 | 14.686786 | 12.466090 | 302220800 |
2012-01-04 | 14.642857 | 14.810000 | 14.617143 | 14.765714 | 12.533089 | 260022000 |
2012-01-05 | 14.819643 | 14.948214 | 14.738214 | 14.929643 | 12.672229 | 271269600 |
2012-01-06 | 14.991786 | 15.098214 | 14.972143 | 15.085714 | 12.804703 | 318292800 |
2012-01-09 | 15.196429 | 15.276786 | 15.048214 | 15.061786 | 12.784389 | 394024400 |
In [ ]:
AAPL['Date'] = pd.to_datetime(AAPL['Date'])
In [ ]:
AAPL.info() # Date Dtype : datetime64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2931 entries, 0 to 2930
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Date 2931 non-null datetime64[ns]
1 Open 2931 non-null float64
2 High 2931 non-null float64
3 Low 2931 non-null float64
4 Close 2931 non-null float64
5 Adj Close 2931 non-null float64
6 Volume 2931 non-null int64
dtypes: datetime64[ns](1), float64(5), int64(1)
memory usage: 160.4 KB
In [ ]:
print(type(AAPL['Date'])) # type == Series
print(type(AAPL['Date'].dt)) # type == Datetime
<class 'pandas.core.series.Series'>
<class 'pandas.core.indexes.accessors.DatetimeProperties'>
In [ ]:
AAPL['Year'] = AAPL['Date'].dt.year
AAPL['Month'] = AAPL['Date'].dt.month
AAPL['day'] = AAPL['Date'].dt.day
AAPL.head()
Out[ ]:
DateOpenHighLowCloseAdj CloseVolumeYearMonthday01234
2012-01-03 | 14.621429 | 14.732143 | 14.607143 | 14.686786 | 12.466090 | 302220800 | 2012 | 1 | 3 |
2012-01-04 | 14.642857 | 14.810000 | 14.617143 | 14.765714 | 12.533089 | 260022000 | 2012 | 1 | 4 |
2012-01-05 | 14.819643 | 14.948214 | 14.738214 | 14.929643 | 12.672229 | 271269600 | 2012 | 1 | 5 |
2012-01-06 | 14.991786 | 15.098214 | 14.972143 | 15.085714 | 12.804703 | 318292800 | 2012 | 1 | 6 |
2012-01-09 | 15.196429 | 15.276786 | 15.048214 | 15.061786 | 12.784389 | 394024400 | 2012 | 1 | 9 |
In [ ]:
# Close 전날과의 차이를 구하기
#AAPL['Close'].iloc[1:] - AAPL['Close'].iloc[:-1] # 이렇게 reset_index() 등 사용해서 처리할수 있지만
AAPL['Close'].diff() # 이런 함수를 사용할 수 있다.
Out[ ]:
0 NaN
1 0.078928
2 0.163929
3 0.156071
4 -0.023928
...
2926 1.349991
2927 1.390000
2928 3.889999
2929 -4.739990
2930 2.229996
Name: Close, Length: 2931, dtype: float64
In [ ]:
AAPL['Close'].pct_change() # 퍼센트. Fractional change between the current and a prior element
Out[ ]:
0 NaN
1 0.005374
2 0.011102
3 0.010454
4 -0.001586
...
2926 0.007737
2927 0.007905
2928 0.021949
2929 -0.026170
2930 0.012643
Name: Close, Length: 2931, dtype: float64
In [ ]:
AAPL['Close_shift'] = AAPL['Close'].shift(1)
AAPL
Out[ ]:
DateOpenHighLowCloseAdj CloseVolumeYearMonthdayClose_shift01234...29262927292829292930
2012-01-03 | 14.621429 | 14.732143 | 14.607143 | 14.686786 | 12.466090 | 302220800 | 2012 | 1 | 3 | NaN |
2012-01-04 | 14.642857 | 14.810000 | 14.617143 | 14.765714 | 12.533089 | 260022000 | 2012 | 1 | 4 | 14.686786 |
2012-01-05 | 14.819643 | 14.948214 | 14.738214 | 14.929643 | 12.672229 | 271269600 | 2012 | 1 | 5 | 14.765714 |
2012-01-06 | 14.991786 | 15.098214 | 14.972143 | 15.085714 | 12.804703 | 318292800 | 2012 | 1 | 6 | 14.929643 |
2012-01-09 | 15.196429 | 15.276786 | 15.048214 | 15.061786 | 12.784389 | 394024400 | 2012 | 1 | 9 | 15.085714 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2023-08-21 | 175.070007 | 176.130005 | 173.740005 | 175.839996 | 175.839996 | 46311900 | 2023 | 8 | 21 | 174.490005 |
2023-08-22 | 177.059998 | 177.679993 | 176.250000 | 177.229996 | 177.229996 | 42084200 | 2023 | 8 | 22 | 175.839996 |
2023-08-23 | 178.520004 | 181.550003 | 178.330002 | 181.119995 | 181.119995 | 52722800 | 2023 | 8 | 23 | 177.229996 |
2023-08-24 | 180.669998 | 181.100006 | 176.009995 | 176.380005 | 176.380005 | 54945800 | 2023 | 8 | 24 | 181.119995 |
2023-08-25 | 177.380005 | 179.149994 | 175.820007 | 178.610001 | 178.610001 | 51418700 | 2023 | 8 | 25 | 176.380005 |
2931 rows × 11 columns
누적평균 vs 이동평균
In [ ]:
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.expanding.html
AAPL['Close'].expanding().mean() # 누적평균
Out[ ]:
0 14.686786
1 14.726250
2 14.794048
3 14.866964
4 14.905929
...
2926 64.509729
2927 64.548227
2928 64.588026
2929 64.626180
2930 64.665069
Name: Close, Length: 2931, dtype: float64
In [ ]:
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.rolling.html
AAPL['MA5'] = AAPL['Close'].rolling(window=5).mean() # 이동평균(window_size=5)
AAPL.head(15)
Out[ ]:
DateOpenHighLowCloseAdj CloseVolumeYearMonthdayClose_shiftMA501234567891011121314
2012-01-03 | 14.621429 | 14.732143 | 14.607143 | 14.686786 | 12.466090 | 302220800 | 2012 | 1 | 3 | NaN | NaN |
2012-01-04 | 14.642857 | 14.810000 | 14.617143 | 14.765714 | 12.533089 | 260022000 | 2012 | 1 | 4 | 14.686786 | NaN |
2012-01-05 | 14.819643 | 14.948214 | 14.738214 | 14.929643 | 12.672229 | 271269600 | 2012 | 1 | 5 | 14.765714 | NaN |
2012-01-06 | 14.991786 | 15.098214 | 14.972143 | 15.085714 | 12.804703 | 318292800 | 2012 | 1 | 6 | 14.929643 | NaN |
2012-01-09 | 15.196429 | 15.276786 | 15.048214 | 15.061786 | 12.784389 | 394024400 | 2012 | 1 | 9 | 15.085714 | 14.905929 |
2012-01-10 | 15.211071 | 15.214286 | 15.053571 | 15.115714 | 12.830169 | 258196400 | 2012 | 1 | 10 | 15.061786 | 14.991714 |
2012-01-11 | 15.095714 | 15.101786 | 14.975357 | 15.091071 | 12.809250 | 215084800 | 2012 | 1 | 11 | 15.115714 | 15.056786 |
2012-01-12 | 15.081429 | 15.103571 | 14.955357 | 15.049643 | 12.774082 | 212587200 | 2012 | 1 | 12 | 15.091071 | 15.080786 |
2012-01-13 | 14.989286 | 15.016071 | 14.952143 | 14.993214 | 12.726187 | 226021600 | 2012 | 1 | 13 | 15.049643 | 15.062286 |
2012-01-17 | 15.150000 | 15.213929 | 15.105714 | 15.167857 | 12.874424 | 242897200 | 2012 | 1 | 17 | 14.993214 | 15.083500 |
2012-01-18 | 15.248571 | 15.338214 | 15.225000 | 15.325357 | 13.008112 | 276791200 | 2012 | 1 | 18 | 15.167857 | 15.125428 |
2012-01-19 | 15.362500 | 15.406071 | 15.232500 | 15.276786 | 12.966883 | 261738400 | 2012 | 1 | 19 | 15.325357 | 15.162571 |
2012-01-20 | 15.267500 | 15.267857 | 14.991071 | 15.010714 | 12.741042 | 413974400 | 2012 | 1 | 20 | 15.276786 | 15.154786 |
2012-01-23 | 15.095357 | 15.301786 | 15.082143 | 15.264643 | 12.956573 | 306062400 | 2012 | 1 | 23 | 15.010714 | 15.209071 |
2012-01-24 | 15.182143 | 15.182143 | 14.983929 | 15.014643 | 12.744378 | 547638000 | 2012 | 1 | 24 | 15.264643 | 15.178429 |
과제
- df_krx의 Marcap(시가총액)를 활용하여 시가총액 순위를 새로운 칼럼으로 정의 (칼럼명 : Marcap_rank)
- Marcap(시가총액)이 높을수록 낮은순위(1등에 가깝다.)
- 순위데이터(Marcap_rank)를 활용하여 100위 이하는 large-cap, 100~300위는 mid-cap, 이외 순위는 small-cap으로 새로운 칼럼 선언(칼럼명 : Marcap_size)
- Marcap_size(large-cap, mid-cap,small-cap) 수익률(ChagesRatio)의 평균을 출력
In [ ]:
import pandas as pd
df_krx = pd.read_csv("https://raw.githubusercontent.com/jin0choi1216/dataset/main/KRX_stocks.csv", index_col=0)
df_krx.head()
Out[ ]:
CodeISU_CDNameMarketDeptCloseChangeCodeChangesChagesRatioOpenHighLowVolumeAmountMarcapStocksMarketId01234
005930 | KR7005930003 | 삼성전자 | KOSPI | NaN | 67100 | 2 | -1100 | -1.61 | 67100 | 67400 | 66900 | 7032462 | 471934306900 | 400572409105000 | 5969782550 | STK |
373220 | KR7373220003 | LG에너지솔루션 | KOSPI | NaN | 546000 | 1 | 11000 | 2.06 | 527000 | 549000 | 525000 | 249493 | 135513119000 | 127764000000000 | 234000000 | STK |
000660 | KR7000660001 | SK하이닉스 | KOSPI | NaN | 116500 | 2 | -4400 | -3.64 | 117400 | 118300 | 115300 | 3533647 | 412064064200 | 84812275522500 | 728002365 | STK |
207940 | KR7207940008 | 삼성바이오로직스 | KOSPI | NaN | 757000 | 2 | -6000 | -0.79 | 755000 | 764000 | 754000 | 23435 | 17763884000 | 53878718000000 | 71174000 | STK |
005490 | KR7005490008 | POSCO홀딩스 | KOSPI | NaN | 577000 | 2 | -2000 | -0.35 | 567000 | 583000 | 561000 | 904736 | 519579225000 | 48797599710000 | 84571230 | STK |
In [ ]:
#1. df_krx의 Marcap(시가총액)를 활용하여 시가총액 순위를 새로운 칼럼으로 정의 (칼럼명 : Marcap_rank)
# - Marcap(시가총액)이 높을수록 낮은순위(1등에 가깝다.)
df_krx['Marcop_rank'] = df_krx['Marcap'].rank(ascending=False)
df_krx
Out[ ]:
CodeISU_CDNameMarketDeptCloseChangeCodeChangesChagesRatioOpenHighLowVolumeAmountMarcapStocksMarketIdMarcop_rank01234...27432744274527462747
005930 | KR7005930003 | 삼성전자 | KOSPI | NaN | 67100 | 2 | -1100 | -1.61 | 67100 | 67400 | 66900 | 7032462 | 471934306900 | 400572409105000 | 5969782550 | STK | 1.0 |
373220 | KR7373220003 | LG에너지솔루션 | KOSPI | NaN | 546000 | 1 | 11000 | 2.06 | 527000 | 549000 | 525000 | 249493 | 135513119000 | 127764000000000 | 234000000 | STK | 2.0 |
000660 | KR7000660001 | SK하이닉스 | KOSPI | NaN | 116500 | 2 | -4400 | -3.64 | 117400 | 118300 | 115300 | 3533647 | 412064064200 | 84812275522500 | 728002365 | STK | 3.0 |
207940 | KR7207940008 | 삼성바이오로직스 | KOSPI | NaN | 757000 | 2 | -6000 | -0.79 | 755000 | 764000 | 754000 | 23435 | 17763884000 | 53878718000000 | 71174000 | STK | 4.0 |
005490 | KR7005490008 | POSCO홀딩스 | KOSPI | NaN | 577000 | 2 | -2000 | -0.35 | 567000 | 583000 | 561000 | 904736 | 519579225000 | 48797599710000 | 84571230 | STK | 5.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
001527 | KR7001522002 | 동양2우B | KOSPI | NaN | 10150 | 2 | -10 | -0.10 | 10040 | 10220 | 10040 | 53 | 534860 | 3131975350 | 308569 | STK | 2744.0 |
288490 | KR7288490006 | 나라소프트 | KONEX | 일반기업부 | 56 | 2 | -1 | -1.75 | 58 | 58 | 55 | 50157 | 2797792 | 2422952280 | 43267005 | KNX | 2745.0 |
245450 | KR7245450002 | 씨앤에스링크 | KONEX | 일반기업부 | 1500 | 1 | 100 | 7.14 | 1500 | 1500 | 1500 | 1 | 1500 | 2369940000 | 1579960 | KNX | 2746.0 |
322190 | KR7322190000 | 베른 | KONEX | 일반기업부 | 139 | 2 | -4 | -2.80 | 150 | 150 | 122 | 10414 | 1281538 | 1240602383 | 8925197 | KNX | 2747.0 |
308700 | KR7308700004 | 테크엔 | KONEX | 일반기업부 | 203 | 2 | -28 | -12.12 | 265 | 265 | 201 | 3076 | 811976 | 812000000 | 4000000 | KNX | 2748.0 |
2748 rows × 18 columns
In [ ]:
# 2. 순위데이터(Marcap_rank)를 활용하여 100위 이하는 large-cap,
# 100~300위는 mid-cap, 이외 순위는 small-cap으로 새로운 칼럼 선언(칼럼명 : Marcap_size)
df_krx['Marcap_size'] = df_krx['Marcop_rank'].apply(lambda x: 'large-cap' if x < 100 else('mid-cap' if x < 300 else 'small-cap'))
df_krx
Out[ ]:
CodeISU_CDNameMarketDeptCloseChangeCodeChangesChagesRatioOpenHighLowVolumeAmountMarcapStocksMarketIdMarcop_rankMarcap_size01234...27432744274527462747
005930 | KR7005930003 | 삼성전자 | KOSPI | NaN | 67100 | 2 | -1100 | -1.61 | 67100 | 67400 | 66900 | 7032462 | 471934306900 | 400572409105000 | 5969782550 | STK | 1.0 | large-cap |
373220 | KR7373220003 | LG에너지솔루션 | KOSPI | NaN | 546000 | 1 | 11000 | 2.06 | 527000 | 549000 | 525000 | 249493 | 135513119000 | 127764000000000 | 234000000 | STK | 2.0 | large-cap |
000660 | KR7000660001 | SK하이닉스 | KOSPI | NaN | 116500 | 2 | -4400 | -3.64 | 117400 | 118300 | 115300 | 3533647 | 412064064200 | 84812275522500 | 728002365 | STK | 3.0 | large-cap |
207940 | KR7207940008 | 삼성바이오로직스 | KOSPI | NaN | 757000 | 2 | -6000 | -0.79 | 755000 | 764000 | 754000 | 23435 | 17763884000 | 53878718000000 | 71174000 | STK | 4.0 | large-cap |
005490 | KR7005490008 | POSCO홀딩스 | KOSPI | NaN | 577000 | 2 | -2000 | -0.35 | 567000 | 583000 | 561000 | 904736 | 519579225000 | 48797599710000 | 84571230 | STK | 5.0 | large-cap |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
001527 | KR7001522002 | 동양2우B | KOSPI | NaN | 10150 | 2 | -10 | -0.10 | 10040 | 10220 | 10040 | 53 | 534860 | 3131975350 | 308569 | STK | 2744.0 | small-cap |
288490 | KR7288490006 | 나라소프트 | KONEX | 일반기업부 | 56 | 2 | -1 | -1.75 | 58 | 58 | 55 | 50157 | 2797792 | 2422952280 | 43267005 | KNX | 2745.0 | small-cap |
245450 | KR7245450002 | 씨앤에스링크 | KONEX | 일반기업부 | 1500 | 1 | 100 | 7.14 | 1500 | 1500 | 1500 | 1 | 1500 | 2369940000 | 1579960 | KNX | 2746.0 | small-cap |
322190 | KR7322190000 | 베른 | KONEX | 일반기업부 | 139 | 2 | -4 | -2.80 | 150 | 150 | 122 | 10414 | 1281538 | 1240602383 | 8925197 | KNX | 2747.0 | small-cap |
308700 | KR7308700004 | 테크엔 | KONEX | 일반기업부 | 203 | 2 | -28 | -12.12 | 265 | 265 | 201 | 3076 | 811976 | 812000000 | 4000000 | KNX | 2748.0 | small-cap |
2748 rows × 19 columns
In [ ]:
df_krx['ChagesRatio']
Out[ ]:
0 -1.61
1 2.06
2 -3.64
3 -0.79
4 -0.35
...
2743 -0.10
2744 -1.75
2745 7.14
2746 -2.80
2747 -12.12
Name: ChagesRatio, Length: 2748, dtype: float64
In [ ]:
# 3. Marcap_size(large-cap, mid-cap,small-cap) 수익률(ChagesRatio)의 평균을 출력
df_krx.groupby(['Marcap_size']).agg({'ChagesRatio' : ['mean']})
Out[ ]:
ChagesRatiomeanMarcap_sizelarge-capmid-capsmall-cap
-0.151919 |
-0.641350 |
-0.183667 |
In [ ]:
df_krx[['ChagesRatio','Marcap_size']].groupby('Marcap_size').mean()
Out[ ]:
ChagesRatioMarcap_sizelarge-capmid-capsmall-cap
-0.151919 |
-0.641350 |
-0.183667 |
In [ ]:
df_krx.groupby('Marcap_size')['ChagesRatio'].mean()
Out[ ]:
Marcap_size
large-cap -0.151919
mid-cap -0.641350
small-cap -0.183667
Name: ChagesRatio, dtype: float64
In [ ]:
df_krx[['Marcap_size', 'ChagesRatio']].groupby(by="Marcap_size").mean()
Out[ ]:
ChagesRatioMarcap_sizelarge-capmid-capsmall-cap
-0.151919 |
-0.641350 |
-0.183667 |
In [ ]:
mean_chages_ratio = pd.pivot_table(data = df_krx, columns='Marcap_size', values='ChagesRatio', aggfunc='mean')
mean_chages_ratio
Out[ ]:
Marcap_sizelarge-capmid-capsmall-capChagesRatio
-0.151919 | -0.64135 | -0.183667 |
'새싹 > TIL' 카테고리의 다른 글
[핀테커스] 230914 데이터 시각화 (3) | 2023.09.14 |
---|---|
[핀테커스] 230913 pandas merge & metplot 시각화 (0) | 2023.09.13 |
[핀테커스] 230911 데이터 사이언스 라이브러리 pandas 실습 (0) | 2023.09.11 |
[핀테커스] 230908 postgresql 실습 (0) | 2023.09.08 |
[핀테커스] 230907 sqlite sql 실습 (0) | 2023.09.06 |