1_팬더스, 데이터프레임, 시리즈 알아보기 Pandas : 파이썬 라이브러리로, 데이터를 수정하고 목적에 맞게 변경시키기 위해 상당히 중요한 라이브러리
import pandas as pd
data_frame = pd.read_csv('data/friend_list_no_head.csv',names=[ "name","age","job"])
#names=[] 열, 변수 이름 지정 가능
data_frame
name
age
job
0
John
20
student
1
Jenny
30
developer
2
Nate
30
teacher
3
Julia
40
dentist
4
Brian
45
manager
5
Chris
25
intern
# pandas를 배우는 이유: 엑셀과 달리 데이터 처리 과정을 코드로 작성해 자동화할 수 있고, 내부적으로 numpy를 사용하므로 숫자 계산이 상당히 빠름
data_frame.head(2)
name
age
job
0
John
20
student
1
Jenny
30
developer
data_frame.tail(2)
name
age
job
4
Brian
45
manager
5
Chris
25
intern
# column 들이 series로 구성됨, 시리즈로 구성된 것이 데이터 프레임
type(data_frame.name)
pandas.core.series.Series
type(data_frame.age)
pandas.core.series.Series
type(data_frame.job)
pandas.core.series.Series
# Build two Series from plain Python lists, then combine them into a DataFrame.
list_tmp = [1, 2, 3]
# Use the public pd.Series constructor; pd.core.series is an internal module path.
s1 = pd.Series(list_tmp)
s2 = pd.Series(["one", "two", "three"])
pd.DataFrame(data=dict(num=s1, word=s2))  # the dict keys become the column names
num
word
0
1
one
1
2
two
2
3
three
# 파이썬 리스트를 사용해 시리즈를 만들고 시리즈를 이용해 데이터 프레임을 생성
2_파일에서 데이터 불러오기 !conda list | findstr pandas
pandas 0.25.1 py37ha925a31_0
import pandas as pd
df = pd.read_csv('data/friend_list_no_head.csv')
df
John
20
student
0
Jenny
30
developer
1
Nate
30
teacher
2
Julia
40
dentist
3
Brian
45
manager
4
Chris
25
intern
df.head(2)
John
20
student
0
Jenny
30
developer
1
Nate
30
teacher
df.tail(2)
John
20
student
3
Brian
45
manager
4
Chris
25
intern
df=pd.read_csv('data/friend_list.txt')
df
name
age
job
0
John
20
student
1
Jenny
30
developer
2
Nate
30
teacher
3
Julia
40
dentist
4
Brian
45
manager
5
Chris
25
intern
쉼표가 아닌 탭으로 구분되어 있는 경우 : delimiter 옵션 사용 df=pd.read_csv('data/friend_list_tab.txt')
df
name\tage\tjob
0
John\t20\tstudent
1
Jenny\t30\tdeveloper
2
Nate\t30\tteacher
3
Julia\t40\tdentist
4
Brian\t45\tmanager
5
Chris\t25\tintern
df=pd.read_csv('data/friend_list_tab.txt', delimiter = '\t') # 탭으로 구분
df
name
age
job
0
John
20
student
1
Jenny
30
developer
2
Nate
30
teacher
3
Julia
40
dentist
4
Brian
45
manager
5
Chris
25
intern
df=pd.read_csv('data/friend_list_no_head.csv', header=None)
df
0
1
2
0
John
20
student
1
Jenny
30
developer
2
Nate
30
teacher
3
Julia
40
dentist
4
Brian
45
manager
5
Chris
25
intern
# column 이름이 없는 경우
df.columns=['name','age','job'] #header의 정보를 알려줌
df
name
age
job
0
John
20
student
1
Jenny
30
developer
2
Nate
30
teacher
3
Julia
40
dentist
4
Brian
45
manager
5
Chris
25
intern
df=pd.read_csv('data/friend_list_no_head.csv', header=None, names=['name','age','job']) #names=[]
df
name
age
job
0
John
20
student
1
Jenny
30
developer
2
Nate
30
teacher
3
Julia
40
dentist
4
Brian
45
manager
5
Chris
25
intern
3_데이터프레임 생성하기 딕셔너리를 사용하여 데이터 프레임 생성하기 import pandas as pd
# One dict per row; the dict keys become the column names.
friend_dict_list =[
{'name':'John','age':25,'job':'student'},
{'name':'Nate','age':30,'job':'teacher'}
]
df= pd.DataFrame(friend_dict_list)
# NOTE: with older Python/pandas versions the column order of a list-of-dicts
# frame did not follow the key order; pandas 0.25+ on Python 3.6+ keeps the
# insertion order of the keys.
df.head() # column order may differ from the key order on older versions
name
age
job
0
John
25
student
1
Nate
30
teacher
df=df[['name','age','job']]
df
name
age
job
0
John
25
student
1
Nate
30
teacher
OrderedDict : 키의 순서가 보장됨 # 한 번에
from collections import OrderedDict

# Column-oriented data: each key is a column name and each value is that
# column's list of cells. Keys are inserted in the desired column order.
friend_ordered_dict = OrderedDict()
friend_ordered_dict['name'] = ['John', 'Nate']
friend_ordered_dict['age'] = [25, 30]
friend_ordered_dict['job'] = ['student', 'teacher']

# Build the frame straight from the ordered mapping and preview it.
df = pd.DataFrame.from_dict(friend_ordered_dict)
df.head()
name
age
job
0
John
25
student
1
Nate
30
teacher
리스트를 사용해 데이터프레임 생성하기 friend_list=[
['John',20,'student'],
['Nate',30,'teacher']
]
# 컬럼 헤더 정보
column_name=['name','age','job']
df= pd.DataFrame.from_records(friend_list, columns=column_name)
df.head()
name
age
job
0
John
20
student
1
Nate
30
teacher
# Build the whole frame in one call from [column name, values] pairs.
friend_list=[
['name',['John','Nate']],
['age',[20,30]],
['job',['student','teacher']],
]
# NOTE: DataFrame.from_items is deprecated (it emits a FutureWarning on
# pandas 0.25) and was removed in pandas 1.0. The supported replacement is
# pd.DataFrame.from_dict(OrderedDict(friend_list)).
df= pd.DataFrame.from_items(friend_list) # deprecated API; not available in current pandas
C:\Users\82107\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order.
"""Entry point for launching an IPython kernel.
df= pd.DataFrame.from_dict(OrderedDict(friend_list))
df
name
age
job
0
John
20
student
1
Nate
30
teacher
4_ 데이터 프레임 파일로 저장하기 import pandas as pd
friends=[{'name':'Jone','age':20,'job':'student'},
{'name':'Jenny','age':30,'job':None}, # None 빈 셀로 표시됨
{'name':'Nate','age':30,'job':'teacher'}
]
df= pd.DataFrame(friends)
df= df[['name','age','job']]
df.head()
name
age
job
0
Jone
20
student
1
Jenny
30
None
2
Nate
30
teacher
데이터 프레임을 csv 파일로 저장하기 df.to_csv('friends.csv', index=True, header= True) #index=True, header= True는 default 값
# row id(인덱스) 생략: index=False, column 이름 생략: header=False
df.to_csv('friends.csv', index=False, header= False, na_rep='-') # None 값 대신 '-'
5_데이터프레임 행, 열(row, column) 선택 및 필터하기 import pandas as pd
friends_list=[
['name',['John','Jenny','Nate']],
['age',[20,30,30]],
['job',['student','developer','teacher']]
]
df=pd.DataFrame.from_dict(OrderedDict(friends_list))
df
name
age
job
0
John
20
student
1
Jenny
30
developer
2
Nate
30
teacher
Jenny, Nate의 정보만 가져오고 싶을 때 df[1:3] # 1번 인덱스와 2번 인덱스를 불러와라, 결과값이 df 데이터프레임에는 적용되지 않는 것을 볼 수 있음.
name
age
job
1
Jenny
30
developer
2
Nate
30
teacher
df
name
age
job
0
John
20
student
1
Jenny
30
developer
2
Nate
30
teacher
# 결과값을 저장하고 싶을 때
df= df[1:3]
df
name
age
job
1
Jenny
30
developer
2
Nate
30
teacher
df
name
age
job
0
John
20
student
1
Jenny
30
developer
2
Nate
30
teacher
각 행을 따로 불러오고 싶을 때 : df.loc[[ ]] 사용 df.loc[[0,2]]
name
age
job
0
John
20
student
2
Nate
30
teacher
df=df.loc[[0,2]]
df
name
age
job
0
John
20
student
2
Nate
30
teacher
df
name
age
job
0
John
20
student
1
Jenny
30
developer
2
Nate
30
teacher
by column condition 컬럼의 조건에 따라 row 선택 25살 이상인 사람의 정보만 선택 # 첫 번째 방법
df[df.age>=25]
name
age
job
1
Jenny
30
developer
2
Nate
30
teacher
# 두 번째 방법
df.query('age>=25')
name
age
job
1
Jenny
30
developer
2
Nate
30
teacher
나이가 25살 이상이고 이름이 Nate인 사람의 정보만 df[(df.age>=25) & (df.name=='Nate')] # and=>&
name
age
job
2
Nate
30
teacher
Filter Column by index friend_list=[
['John',20,'student'],
['Jenny',30,'developer'],
['Nate',30,'teacher']
]
df=pd.DataFrame.from_records(friend_list)
df
0
1
2
0
John
20
student
1
Jenny
30
developer
2
Nate
30
teacher
# 2번째 컬럼 제외
# 첫번째 방법
df.iloc[:,0:2] # 모든 row를 불러오고 column은 0번째 1번째만
0
1
0
John
20
1
Jenny
30
2
Nate
30
df.iloc[0:2,0:2]
by column name # 두번째 방법
# csv 파일을 불러옴
df= pd.read_csv('data/friend_list_no_head.csv', header=None, names=['name','age','job'])
df
name
age
job
0
John
20
student
1
Jenny
30
developer
2
Nate
30
teacher
3
Julia
40
dentist
4
Brian
45
manager
5
Chris
25
intern
df_filtered= df[['name','age']]
df_filtered
name
age
0
John
20
1
Jenny
30
2
Nate
30
3
Julia
40
4
Brian
45
5
Chris
25
df
name
age
job
0
John
20
student
1
Jenny
30
developer
2
Nate
30
teacher
3
Julia
40
dentist
4
Brian
45
manager
5
Chris
25
intern
df.filter(items=['age','job'])
age
job
0
20
student
1
30
developer
2
30
teacher
3
40
dentist
4
45
manager
5
25
intern
df
name
age
job
0
John
20
student
1
Jenny
30
developer
2
Nate
30
teacher
3
Julia
40
dentist
4
Brian
45
manager
5
Chris
25
intern
컬럼 이름에 'a'가 들어가는 컬럼만 불러올 때 df.filter(like='a',axis=1) # column은 axis=1
name
age
0
John
20
1
Jenny
30
2
Nate
30
3
Julia
40
4
Brian
45
5
Chris
25
df
name
age
job
0
John
20
student
1
Jenny
30
developer
2
Nate
30
teacher
3
Julia
40
dentist
4
Brian
45
manager
5
Chris
25
intern
column 이름이 b로 끝나는 컬럼만 불러올 때 df.filter(regex='b$', axis=1)
job
0
student
1
developer
2
teacher
3
dentist
4
manager
5
intern
6_데이터프레임 행, 열 삭제하기(drop row, column) row를 drop하기 import pandas as pd
friends=[{'age':15,'job':'student'},
{'age':25,'job':'developer'},
{'age':30,'job':'teacjer'},]
df= pd.DataFrame(friends, index=['John','Jenny','Nate'], columns=['age','job'])
df
age
job
John
15
student
Jenny
25
developer
Nate
30
teacjer
df.drop(['John','Nate'])
age
job
Jenny
25
developer
df #위의 실행 코드가 df 데이터프레임에 적용되지는 않음
age
job
John
15
student
Jenny
25
developer
Nate
30
teacjer
df=df.drop(['John','Nate'])
df
age
job
Jenny
25
developer
inplace 명령어 사용해서 바로 df에 결과값 저장 df.drop(['John','Nate'], inplace= True)
df
age
job
Jenny
25
developer
인덱스에 값이 아니라 0,1,2로 되어 있는 경우 friends=[{'name':'John','age':15,'job':'student'},
{'name':'Ben','age':25,'job':'developer'},
{'name':'Jenny','age':30,'job':'teacjer'},]
df= pd.DataFrame(friends, columns=['name','age','job'])
df #인덱스에 숫자가 들어가게 됨
name
age
job
0
John
15
student
1
Ben
25
developer
2
Jenny
30
teacjer
인덱스로만 John, Jenny row 삭제 df=df.drop(df.index[[0,2]])
df
name
age
job
1
Ben
25
developer
df= pd.DataFrame(friends, columns=['name','age','job'])
df
name
age
job
0
John
15
student
1
Ben
25
developer
2
Jenny
30
teacjer
column의 value에 따라 drop하기 age가 20살 초과인 row의 정보만 불러오기 df=df[df.age>20]
df
name
age
job
1
Ben
25
developer
2
Jenny
30
teacjer
column을 drop 하는 방법 df= pd.DataFrame(friends, columns=['name','age','job'])
df
name
age
job
0
John
15
student
1
Ben
25
developer
2
Jenny
30
teacjer
df= df.drop('age',axis=1) # axis=1 컬럼 중에서 age라는 컬럼을 삭제
df
name
job
0
John
student
1
Ben
developer
2
Jenny
teacjer
inplace 명령어 사용해서 바로 df에 결과값 저장 df= pd.DataFrame(friends, columns=['name','age','job'])
df.drop('age',axis=1, inplace=True)
df #결과값을 따로 저장하지 않아도 바로
name
job
0
John
student
1
Ben
developer
2
Jenny
teacjer
7_데이터 그룹 만들기 (group by) import pandas as pd
student_list = [{'name':'John', 'major':'Computer Science', 'sex':'male'},
{'name':'Nate', 'major':'Computer Science', 'sex':'male'},
{'name':'Abraham', 'major':'Physics', 'sex':'male'},
{'name':'Brian', 'major':'Psychology', 'sex':'male'},
{'name':'Jenny', 'major':'Economics', 'sex':'female'},
{'name':'Yuna', 'major':'Economics', 'sex':'female'},
{'name':'Jeniffer', 'major':'Computer Science', 'sex':'female'},
{'name':'Edward', 'major':'Computer Science', 'sex':'male'},
{'name':'Zara', 'major':'Psychology', 'sex':'female'},
{'name':'wendy', 'major':'Economics', 'sex':'female'},
{'name':'Sera', 'major':'Psychology', 'sex':'female'}]
df = pd.DataFrame(student_list, columns = ['name', 'major', 'sex'])
df
name
major
sex
0
John
Computer Science
male
1
Nate
Computer Science
male
2
Abraham
Physics
male
3
Brian
Psychology
male
4
Jenny
Economics
female
5
Yuna
Economics
female
6
Jeniffer
Computer Science
female
7
Edward
Computer Science
male
8
Zara
Psychology
female
9
wendy
Economics
female
10
Sera
Psychology
female
### 각 학과별로 몇명인지 알고 싶을 때
groupby_major = df.groupby('major')
groupby_major.groups
{'Computer Science': Int64Index([0, 1, 6, 7], dtype='int64'),
'Economics': Int64Index([4, 5, 9], dtype='int64'),
'Physics': Int64Index([2], dtype='int64'),
'Psychology': Int64Index([3, 8, 10], dtype='int64')}
# Iterating a GroupBy yields (group key, sub-frame) pairs.
# Print each major with its head-count, then the rows belonging to it.
for name, group in groupby_major:
    print(f"{name} : {len(group)}")
    print(group)
    print()  # blank line between groups
Computer Science : 4
name major sex
0 John Computer Science male
1 Nate Computer Science male
6 Jeniffer Computer Science female
7 Edward Computer Science male
Economics : 3
name major sex
4 Jenny Economics female
5 Yuna Economics female
9 wendy Economics female
Physics : 1
name major sex
2 Abraham Physics male
Psychology : 3
name major sex
3 Brian Psychology male
8 Zara Psychology female
10 Sera Psychology female
df_major_cnt = pd.DataFrame( {'count' :groupby_major.size()} ).reset_index()
df_major_cnt
major
count
0
Computer Science
4
1
Economics
3
2
Physics
1
3
Psychology
3
# Same breakdown as above, grouped by the 'sex' column.
groupby_sex = df.groupby('sex')
# Iterating a GroupBy yields (group key, sub-frame) pairs.
for name, group in groupby_sex:
    print(f"{name} : {len(group)}")
    print(group)
    print()  # blank line between groups
female : 6
name major sex
4 Jenny Economics female
5 Yuna Economics female
6 Jeniffer Computer Science female
8 Zara Psychology female
9 wendy Economics female
10 Sera Psychology female
male : 5
name major sex
0 John Computer Science male
1 Nate Computer Science male
2 Abraham Physics male
3 Brian Psychology male
7 Edward Computer Science male
8_중복 데이터 삭제하기(drop duplicates) student_list = [{'name': 'John', 'major': "Computer Science", 'sex': "male"},
{'name': 'Nate', 'major': "Computer Science", 'sex': "male"},
{'name': 'Abraham', 'major': "Physics", 'sex': "male"},
{'name': 'Brian', 'major': "Psychology", 'sex': "male"},
{'name': 'Janny', 'major': "Economics", 'sex': "female"},
{'name': 'Yuna', 'major': "Economics", 'sex': "female"},
{'name': 'Jeniffer', 'major': "Computer Science", 'sex': "female"},
{'name': 'Edward', 'major': "Computer Science", 'sex': "male"},
{'name': 'Zara', 'major': "Psychology", 'sex': "female"},
{'name': 'Wendy', 'major': "Economics", 'sex': "female"},
{'name': 'Sera', 'major': "Psychology", 'sex': "female"},
{'name': 'John', 'major': "Computer Science", 'sex': "male"},
]
df = pd.DataFrame(student_list, columns = ['name', 'major', 'sex'])
df
name
major
sex
0
John
Computer Science
male
1
Nate
Computer Science
male
2
Abraham
Physics
male
3
Brian
Psychology
male
4
Janny
Economics
female
5
Yuna
Economics
female
6
Jeniffer
Computer Science
female
7
Edward
Computer Science
male
8
Zara
Psychology
female
9
Wendy
Economics
female
10
Sera
Psychology
female
11
John
Computer Science
male
df.duplicated() # True는 중복
0 False
1 False
2 False
3 False
4 False
5 False
6 False
7 False
8 False
9 False
10 False
11 True
dtype: bool
### 중복된 값 삭제
df.drop_duplicates() # 11번째 행 삭제
name
major
sex
0
John
Computer Science
male
1
Nate
Computer Science
male
2
Abraham
Physics
male
3
Brian
Psychology
male
4
Janny
Economics
female
5
Yuna
Economics
female
6
Jeniffer
Computer Science
female
7
Edward
Computer Science
male
8
Zara
Psychology
female
9
Wendy
Economics
female
10
Sera
Psychology
female
student_list = [{'name': 'John', 'major': "Computer Science", 'sex': "male"},
{'name': 'Nate', 'major': "Computer Science", 'sex': "male"},
{'name': 'Edward', 'major': "Computer Science", 'sex': "male"},
{'name': 'Zara', 'major': "Psychology", 'sex': "female"},
{'name': 'Wendy', 'major': "Economics", 'sex': "female"},
{'name': 'Nate', 'major': None, 'sex': "female"},
{'name': 'John', 'major': "Economics", 'sex': "male"},
]
df = pd.DataFrame(student_list, columns = ['name', 'major', 'sex'])
df
name
major
sex
0
John
Computer Science
male
1
Nate
Computer Science
male
2
Edward
Computer Science
male
3
Zara
Psychology
female
4
Wendy
Economics
female
5
Nate
None
female
6
John
Economics
male
df.duplicated() #중복된 값이 없다고 나옴
0 False
1 False
2 False
3 False
4 False
5 False
6 False
dtype: bool
df.duplicated(['name']) #'name' 인자를 넣어 이름이 중복되면 True 값 반환
0 False
1 False
2 False
3 False
4 False
5 True
6 True
dtype: bool
### Drop rows whose 'name' duplicates an earlier row
df.drop_duplicates(['name'], keep = 'first' ) # the first John and Nate rows are kept; keep='first' is the default
name
major
sex
0
John
Computer Science
male
1
Nate
Computer Science
male
2
Edward
Computer Science
male
3
Zara
Psychology
female
4
Wendy
Economics
female
### 중복된 값 삭제
df.drop_duplicates(['name'], keep = 'last' ) #뒤에 있는 John,Nate는 남게 됨
name
major
sex
2
Edward
Computer Science
male
3
Zara
Psychology
female
4
Wendy
Economics
female
5
Nate
None
female
6
John
Economics
male
9_ NaN (None) 찾아서 원하는 값으로 변경 student_id_list = [{'name': 'John', 'job': "teacher", 'age': 40},
{'name': 'Nate', 'job': "teacher", 'age': 35},
{'name': 'Yuna', 'job': "teacher", 'age': 37},
{'name': 'Abraham', 'job': "student", 'age': 10},
{'name': 'Brian', 'job': "student", 'age': 12},
{'name': 'Jenny', 'job': "student", 'age': 11},
{'name': 'Nate', 'job': "teacher", 'age': None},
{'name': 'John', 'job': "student", 'age': None}
]
df = pd.DataFrame(student_id_list, columns = ['name', 'job', 'age'])
df
name
job
age
0
John
teacher
40.0
1
Nate
teacher
35.0
2
Yuna
teacher
37.0
3
Abraham
student
10.0
4
Brian
student
12.0
5
Jenny
student
11.0
6
Nate
teacher
NaN
7
John
student
NaN
None 값 찾기 ### None 값 찾기
df.shape
(8, 3)
df.info() # age 6 non-null float64, 2개가 None 값
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 3 columns):
name 8 non-null object
job 8 non-null object
age 6 non-null float64
dtypes: float64(1), object(2)
memory usage: 320.0+ bytes
df.isna() #True 값이 None 값임
name
job
age
0
False
False
False
1
False
False
False
2
False
False
False
3
False
False
False
4
False
False
False
5
False
False
False
6
False
False
True
7
False
False
True
df.isnull()
name
job
age
0
False
False
False
1
False
False
False
2
False
False
False
3
False
False
False
4
False
False
False
5
False
False
False
6
False
False
True
7
False
False
True
df.age = df.age.fillna(0)
df
name
job
age
0
John
teacher
40.0
1
Nate
teacher
35.0
2
Yuna
teacher
37.0
3
Abraham
student
10.0
4
Brian
student
12.0
5
Jenny
student
11.0
6
Nate
teacher
0.0
7
John
student
0.0
student_id_list = [{'name': 'John', 'job': "teacher", 'age': 40},
{'name': 'Nate', 'job': "teacher", 'age': 35},
{'name': 'Yuna', 'job': "teacher", 'age': 37},
{'name': 'Abraham', 'job': "student", 'age': 10},
{'name': 'Brian', 'job': "student", 'age': 12},
{'name': 'Jenny', 'job': "student", 'age': 11},
{'name': 'Nate', 'job': "teacher", 'age': None},
{'name': 'John', 'job': "student", 'age': None}
]
df = pd.DataFrame(student_id_list, columns = ['name', 'job', 'age'])
df
name
job
age
0
John
teacher
40.0
1
Nate
teacher
35.0
2
Yuna
teacher
37.0
3
Abraham
student
10.0
4
Brian
student
12.0
5
Jenny
student
11.0
6
Nate
teacher
NaN
7
John
student
NaN
# Fill each missing age with the median age of that row's job group.
# transform('median') returns a Series aligned with df's index, so fillna
# picks the matching group median for every NaN.
# The result is assigned back instead of calling fillna(..., inplace=True) on
# df['age']: the in-place form acts on an intermediate Series and is not
# guaranteed to update df (it does not under pandas Copy-on-Write).
df['age'] = df['age'].fillna(df.groupby('job')['age'].transform('median'))
df
name
job
age
0
John
teacher
40.0
1
Nate
teacher
35.0
2
Yuna
teacher
37.0
3
Abraham
student
10.0
4
Brian
student
12.0
5
Jenny
student
11.0
6
Nate
teacher
37.0
7
John
student
11.0
10_ apply 함수 data_list = [{'yyyy-mm-dd' : '2000-06-27'},
{'yyyy-mm-dd' : '2002-09-24'},
{'yyyy-mm-dd' : '2005-12-20'},
]
df = pd.DataFrame(data_list, columns = ['yyyy-mm-dd'])
df
yyyy-mm-dd
0
2000-06-27
1
2002-09-24
2
2005-12-20
def extract_year(column):
    """Return the year part of a 'yyyy-mm-dd' date string.

    Args:
        column: One date string such as '2000-06-27'.

    Returns:
        The text before the first '-' (e.g. '2000'), still as a string.
    """
    return column.split('-')[0]
df['year'] = df['yyyy-mm-dd'].apply(extract_year)
df
yyyy-mm-dd
year
0
2000-06-27
2000
1
2002-09-24
2002
2
2005-12-20
2005
### Age derived from the birth year and the current year.
### Note the +1: the birth year itself counts as age 1 (this looks like
### Korean age reckoning), so the result is not simply current_year - year.
def get_age(year, current_year):
    """Return the age in ``current_year`` of someone born in ``year``.

    Args:
        year: Birth year, as an int or a numeric string such as '2000'.
        current_year: The year to compute the age for, as an int.

    Returns:
        ``current_year - int(year) + 1``; e.g. born 2000 -> 21 in 2020.
    """
    return current_year - int(year) + 1
df['age'] = df['year'].apply(get_age, current_year = 2020)
df
yyyy-mm-dd
year
age
0
2000-06-27
2000
21
1
2002-09-24
2002
19
2
2005-12-20
2005
16
### get introduce
def get_introduce(age, prefix, suffix):
    """Build an introduction sentence around ``age``.

    Args:
        age: The value placed in the middle of the sentence.
        prefix: Text placed before the age (e.g. "I am").
        suffix: Text placed after the age (e.g. "years old").

    Returns:
        The three parts joined by single spaces, e.g. "I am 21 years old".
    """
    return f"{prefix} {age} {suffix}"
df['introduce'] = df['age'].apply(get_introduce, prefix = "I am", suffix = "years old")
df
yyyy-mm-dd
year
age
introduce
0
2000-06-27
2000
21
I am 21 years old
1
2002-09-24
2002
19
I am 19 years old
2
2005-12-20
2005
16
I am 16 years old
### apply using several columns at once
def get_introduce_2(row):
    """Build a sentence from the ``year`` and ``age`` fields of one row.

    Args:
        row: Any object exposing ``year`` and ``age`` attributes.

    Returns:
        A string of the form "I was born in <year> my age is <age>".
    """
    return f"I was born in {row.year} my age is {row.age}"
df['introduce_2'] = df.apply(get_introduce_2, axis = 1) # axis=1: the function is called once per row
df
yyyy-mm-dd
year
age
introduce
introduce_2
0
2000-06-27
2000
21
I am 21 years old
I was born in 2000 my age is 21
1
2002-09-24
2002
19
I am 19 years old
I was born in 2002 my age is 19
2
2005-12-20
2005
16
I am 16 years old
I was born in 2005 my age is 16
11_map, applymap 함수 data_list = [{'date' : '2000-06-27'},
{'date' : '2002-09-24'},
{'date' : '2005-12-20'},
]
df = pd.DataFrame(data_list, columns = ['date'])
df
date
0
2000-06-27
1
2002-09-24
2
2005-12-20
def extract_year(date):
    """Return the year part of a 'yyyy-mm-dd' date string.

    Args:
        date: One date string such as '2000-06-27'.

    Returns:
        The text before the first '-' (e.g. '2000'), still as a string.
    """
    return date.split('-')[0]
df['year'] = df['date'].map(extract_year)
df
date
year
0
2000-06-27
2000
1
2002-09-24
2002
2
2005-12-20
2005
### apply함수와 map 함수 사용 차이
job_list = [ {'age':20, 'job':'student'},
{'age':30, 'job':'devoloper'},
{'age':20, 'job':'teacher'},
]
df = pd.DataFrame(job_list)
df
age
job
0
20
student
1
30
devoloper
2
20
teacher
df.job = df.job.map({'student':1, 'devoloper':2, 'teacher':3})
df
age
job
0
20
1
1
30
2
2
20
3
# map 함수는 apply 함수처럼 컬럼을 변경해 줄 수 있고, 딕셔너리를 직접 전달해 원하는 값으로 바꿀 수 있음
x_y = [{'x' : 5.5, 'y': -5.6, 'z':-1.1},
{'x' : -5.2, 'y': 5.5, 'z':-2.2},
{'x' : -1.6, 'y': -4.5, 'z':-3.3}
]
df = pd.DataFrame(x_y)
df
x
y
z
0
5.5
-5.6
-1.1
1
-5.2
5.5
-2.2
2
-1.6
-4.5
-3.3
import numpy as np
# Apply one function to every cell of every column.
# NOTE: np.around rounds exact halves to the nearest even value, so
# -4.5 becomes -4.0 while 5.5 becomes 6.0.
# NOTE: DataFrame.applymap is deprecated since pandas 2.1 in favour of
# DataFrame.map; on the pandas 0.25 used here only applymap is available.
df = df.applymap(np.around) # round every value
df
x
y
z
0
6.0
-6.0
-1.0
1
-5.0
6.0
-2.0
2
-2.0
-4.0
-3.0
12_ unique, value_counts 컬럼 내 유니크한 값 뽑아내고 갯수 확인하기 job_list = [{'name': 'John', 'job': "teacher"},
{'name': 'Nate', 'job': "teacher"},
{'name': 'Fred', 'job': "teacher"},
{'name': 'Abraham', 'job': "student"},
{'name': 'Brian', 'job': "student"},
{'name': 'Jenny', 'job': "devoloper"},
{'name': 'Nate', 'job': "teacher"},
{'name': 'Ian', 'job': "teacher"},
{'name': 'Chris', 'job': "banker"},
{'name': 'Philip', 'job': "lawyer"},
{'name': 'Janny', 'job': "basketball player"},
{'name': 'Gwen', 'job': "teacher"},
{'name': 'Jessy', 'job': "student"}
]
df = pd.DataFrame(job_list, columns = ['name', 'job'])
df
name
job
0
John
teacher
1
Nate
teacher
2
Fred
teacher
3
Abraham
student
4
Brian
student
5
Jenny
devoloper
6
Nate
teacher
7
Ian
teacher
8
Chris
banker
9
Philip
lawyer
10
Janny
basketball player
11
Gwen
teacher
12
Jessy
student
### unique한 value 뽑을 때
df.job.unique() # class에 있는 값들을 하나씩만 보여줌
array(['teacher', 'student', 'devoloper', 'banker', 'lawyer',
'basketball player'], dtype=object)
### 각 직업별로 몇개의 데이터가 있는지
df.job.value_counts()
teacher 6
student 3
basketball player 1
devoloper 1
banker 1
lawyer 1
Name: job, dtype: int64
13_두 개의 데이터프레임 합치기 (concat, append) l1 = [{'name': 'John', 'job': "teacher"},
{'name': 'Nate', 'job': "student"},
{'name': 'Fred', 'job': "developer"}]
l2 = [{'name': 'Ed', 'job': "dentist"},
{'name': 'Jack', 'job': "farmer"},
{'name': 'Ted', 'job': "designer"}]
l3 = [{'name': 'John', 'job': "teacher"},
{'name': 'Nate', 'job': "student"},
{'name': 'Fred', 'job': "developer"}]
l4 = [{'age': 25, 'country': "U.S"},
{'age': 30, 'country': "U.K"},
{'age': 45, 'country': "Korea"}]
df1 = pd.DataFrame(l1, columns = ['name', 'job'])
df2 = pd.DataFrame(l2, columns = ['name', 'job'])
df1
name
job
0
John
teacher
1
Nate
student
2
Fred
developer
df2
name
job
0
Ed
dentist
1
Jack
farmer
2
Ted
designer
result = pd.concat([df1,df2], ignore_index = True)
result
name
job
0
John
teacher
1
Nate
student
2
Fred
developer
3
Ed
dentist
4
Jack
farmer
5
Ted
designer
# NOTE: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat([df1, df2], ignore_index=True) is the supported equivalent.
# With ignore_index=True the result is re-indexed 0..n-1.
result = df1.append(df2, ignore_index = True)
result
name
job
0
John
teacher
1
Nate
student
2
Fred
developer
3
Ed
dentist
4
Jack
farmer
5
Ted
designer
import pandas as pd
df3 = pd.DataFrame(l3, columns = ['name', 'job'])
df4 = pd.DataFrame(l4, columns = ['age', 'country'])
df3
name
job
0
John
teacher
1
Nate
student
2
Fred
developer
df4
age
country
0
25
U.S
1
30
U.K
2
45
Korea
result = pd.concat([df3, df4], axis = 1, ignore_index = True) # axis=1, 열
result
0
1
2
3
0
John
teacher
25
U.S
1
Nate
student
30
U.K
2
Fred
developer
45
Korea
# Two parallel lists of equal length, shown side by side as a two-column table.
label = [1, 2, 3, 4, 5]
prediction = [1, 2, 2, 4, 4]

# Pair the lists row by row and name the resulting columns explicitly.
comparison = pd.DataFrame(
    list(zip(label, prediction)),
    columns=['label', 'prediction'],
)
comparison
label
prediction
0
1
1
1
2
2
2
3
2
3
4
4
4
5
4