-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathBzdawka_Igor_assignment_2.py
117 lines (101 loc) · 6.52 KB
/
Bzdawka_Igor_assignment_2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
### Data Processing in R and Python 2022Z
### Homework Assignment no. 2
###
### IMPORTANT
### This file should contain only solutions to tasks in the form of a functions
### definitions and comments to the code.
###
#
import pandas as pd
import numpy as np
# -----------------------------------------------------------------------------#
# Task 1
# -----------------------------------------------------------------------------#
def solution_1(Posts):
result = Posts
result['Year'] = result['CreationDate']
result['Year'] = pd.to_datetime(result['Year'], errors='coerce').dt.strftime('%Y')
result = result.groupby('Year', as_index=False)['Id'].count().rename(columns={"Id": "TotalNumber"})
return result
# -----------------------------------------------------------------------------#
# Task 2
# -----------------------------------------------------------------------------#
def solution_2(Users, Posts):
us = Users[['Id', 'DisplayName']]
questions = Posts[Posts['PostTypeId'] == 1][['OwnerUserId', 'ViewCount']]
result = pd.merge(left=us, right=questions, left_on='Id', right_on='OwnerUserId')[
['Id', 'DisplayName', 'ViewCount']].groupby(['Id', 'DisplayName'], as_index=False)['ViewCount'].sum().rename(
columns={"ViewCount": "TotalViews"}).sort_values(by='TotalViews', ascending=False).head(10)
result.index = range(0, len(result.index))
return result
# -----------------------------------------------------------------------------#
# Task 3
# -----------------------------------------------------------------------------#
def solution_3(Badges):
bn = Badges[['Name', 'Date']]
BadgesNames = bn.loc[:].copy()
BadgesNames.loc[:, 'Year'] = BadgesNames.apply(lambda row: pd.to_datetime(row.Date, errors='coerce').strftime('%Y'),
axis=1)
BadgesNames = BadgesNames[['Name', 'Year']].groupby(['Name', 'Year']).size().reset_index(name="Count")
BadgesYearly = BadgesNames.groupby('Year')['Count'].sum().reset_index(name="CountTotal")
NoName = pd.merge(left=BadgesNames, right=BadgesYearly, left_on='Year', right_on='Year')
NameAndCount = NoName[['Name', 'Count', 'Year']]
NoName = NoName.groupby(['Year', 'CountTotal'], as_index=False)['Count'].max()
result = pd.merge(left=NoName, right=NameAndCount, left_on=['Year', 'Count'], right_on=['Year', 'Count'],
how='left')
result.loc[:, 'MaxPercentage'] = result.apply(lambda row: row.Count / row.CountTotal, axis=1)
result = result[['Year', 'Name', 'MaxPercentage']]
return result
# -----------------------------------------------------------------------------#
# Task 4
# -----------------------------------------------------------------------------#
def solution_4(Comments, Posts, Users):
CmtTotScr = Comments[['PostId', 'Score']].groupby('PostId', as_index=False)['Score'].sum().rename(
columns={"Score": "CommentsTotalScore"})
PostsBestComments = pd.merge(left=CmtTotScr, right=Posts, left_on="PostId", right_on="Id")
PostsBestComments = PostsBestComments[PostsBestComments['PostTypeId'] == 1][
['OwnerUserId', 'Title', 'CommentCount', 'ViewCount', 'CommentsTotalScore']]
PostsBestComments.index = range(0, len(PostsBestComments.index))
result = pd.merge(left=PostsBestComments, right=Users, left_on="OwnerUserId", right_on="Id")[
['Title', 'CommentCount', 'ViewCount', 'CommentsTotalScore', 'DisplayName', 'Reputation',
'Location']].sort_values(by='CommentsTotalScore', ascending=False).head(10)
result.index = range(0, len(result.index))
result = result.where(pd.notnull(result), None)
return result
# -----------------------------------------------------------------------------#
# Task 5
# -----------------------------------------------------------------------------#
def solution_5(Posts, Votes):
VotesDates = Votes[Votes['VoteTypeId'].isin([3, 4, 12])][['PostId', 'CreationDate']]
VotesDates.loc[:, 'Year'] = VotesDates.apply(
lambda row: pd.to_datetime(row.CreationDate, errors='coerce').strftime('%Y'), axis=1)
VotesDates.loc[:, 'VoteDate'] = VotesDates.apply(lambda row: np.where(int(row.Year) == 2022, 'after',
np.where(int(row.Year) == 2021, 'during',
np.where(int(row.Year) == 2020,
'during', np.where(
int(row.Year) == 2019,
'during', 'before')))),
axis=1)
VotesDates = VotesDates[['PostId', 'VoteDate']]
VotesDates['VoteDate'] = VotesDates['VoteDate'].astype('string')
VotesDates = VotesDates.groupby(['PostId', 'VoteDate']).size().reset_index(name="Total")
VotesByAge = VotesDates
VotesByAge['BeforeCOVIDVotes'] = VotesByAge.apply(lambda row: np.where(row.VoteDate == 'before', row.Total, 0),
axis=1).astype('int64')
VotesByAge['DuringCOVIDVotes'] = VotesByAge.apply(lambda row: np.where(row.VoteDate == 'during', row.Total, 0),
axis=1).astype('int64')
VotesByAge['AfterCOVIDVotes'] = VotesByAge.apply(lambda row: np.where(row.VoteDate == 'after', row.Total, 0),
axis=1).astype('int64')
VotesByAge = VotesByAge.groupby('PostId', as_index=False)[
['BeforeCOVIDVotes', 'DuringCOVIDVotes', 'AfterCOVIDVotes']].max()
VotesByAge['Votes'] = VotesByAge.apply(
lambda row: row.BeforeCOVIDVotes + row.DuringCOVIDVotes + row.AfterCOVIDVotes, axis=1)
result = pd.merge(left=Posts, right=VotesByAge, left_on="Id", right_on="PostId")
result = result[(~(result['Title'].isnull())) & (result['DuringCOVIDVotes'] > 0)][
['Title', 'CreationDate', 'PostId', 'BeforeCOVIDVotes', 'DuringCOVIDVotes', 'AfterCOVIDVotes',
'Votes']].sort_values(by=['DuringCOVIDVotes', 'Votes'], ascending=[False, False]).head(20)
result.loc[:, 'Date'] = result.apply(
lambda row: pd.to_datetime(row.CreationDate, errors='coerce').strftime('%Y-%m-%d'), axis=1)
result = result[['Title', 'Date', 'PostId', 'BeforeCOVIDVotes', 'DuringCOVIDVotes', 'AfterCOVIDVotes', 'Votes']]
result.index = range(0, len(result.index))
return result