diff --git a/Notebooks/02_data_wrangling.ipynb b/Notebooks/02_data_wrangling.ipynb
index a52eb6c24..8080982be 100644
--- a/Notebooks/02_data_wrangling.ipynb
+++ b/Notebooks/02_data_wrangling.ipynb
@@ -126,9 +126,9 @@
    "source": [
     "#Code task 1#\n",
     "#Import pandas, matplotlib.pyplot, and seaborn in the correct lines below\n",
-    "import ___ as pd\n",
-    "import ___ as plt\n",
-    "import ___ as sns\n",
+    "import pandas as pd\n",
+    "import matplotlib.pyplot as plt\n",
+    "import seaborn as sns\n",
     "import os\n",
     "\n",
     "from library.sb_utils import save_file\n"
   ] }, {
@@ -185,7 +185,7 @@
    "source": [
     "#Code task 2#\n",
     "#Call the info method on ski_data to see a summary of the data\n",
-    "ski_data.___"
+    "ski_data.info()"
   ] }, {
@@ -212,7 +212,7 @@
    "source": [
     "#Code task 3#\n",
     "#Call the head method on ski_data to print the first several rows of the data\n",
-    "ski_data.___"
+    "ski_data.head()"
   ] }, {
@@ -253,7 +253,7 @@
     "#Filter the ski_data dataframe to display just the row for our resort with the name 'Big Mountain Resort'\n",
     "#Hint: you will find that the transpose of the row will give a nicer output. DataFrame's do have a\n",
     "#transpose method, but you can access this conveniently with the `T` property.\n",
-    "ski_data[ski_data.Name == ___].___"
+    "ski_data[ski_data.Name == 'Big Mountain Resort'].T"
   ] }, {
@@ -288,9 +288,9 @@
     "#ski_data as well as the percentages (using `.mean()` instead of `.sum()`).\n",
     "#Order them (increasing or decreasing) using sort_values\n",
     "#Call `pd.concat` to present these in a single table (DataFrame) with the helpful column names 'count' and '%'\n",
-    "missing = ___([ski_data.___.___, 100 * ski_data.___.___], axis=1)\n",
-    "missing.columns=[___, ___]\n",
-    "missing.___(by=___)"
+    "missing = pd.concat([ski_data.isnull().sum(), 100 * ski_data.isnull().mean()], axis=1)\n",
+    "missing.columns=['count', '%']\n",
+    "missing.sort_values(by='count')"
   ] }, {
@@ -322,7 +322,7 @@
    "source": [
     "#Code task 6#\n",
     "#Use ski_data's `select_dtypes` method to select columns of dtype 'object'\n",
-    "ski_data.___(___)"
+    "ski_data.select_dtypes('object')"
   ] }, {
@@ -350,7 +350,7 @@
    "source": [
     "#Code task 7#\n",
     "#Use pandas' Series method `value_counts` to find any duplicated resort names\n",
-    "ski_data['Name'].___.head()"
+    "ski_data['Name'].value_counts().head()"
   ] }, {
@@ -375,7 +375,7 @@
    "source": [
     "#Code task 8#\n",
     "#Concatenate the string columns 'Name' and 'Region' and count the values again (as above)\n",
-    "(ski_data[___] + ', ' + ski_data[___]).___.head()"
+    "(ski_data['Name'] + ', ' + ski_data['Region']).value_counts().head()"
   ] }, {
@@ -386,7 +386,7 @@
    "source": [
     "#Code task 9#\n",
     "#Concatenate 'Name' and 'state' and count the values again (as above)\n",
-    "(ski_data[___] + ', ' + ski_data[___]).___.head()"
+    "(ski_data['Name'] + ', ' + ski_data['state']).value_counts().head()"
   ] }, {
@@ -577,7 +577,7 @@
    "source": [
     "#Code task 10#\n",
     "#Calculate the number of times Region does not equal state\n",
-    "(ski_data.Region ___ ski_data.state).___"
+    "(ski_data.Region != ski_data.state).sum()"
   ] }, {
@@ -661,8 +661,8 @@
     "#Code task 11#\n",
     "#Filter the ski_data dataframe for rows where 'Region' and 'state' are different,\n",
     "#group that by 'state' and perform `value_counts` on the 'Region'\n",
-    "(ski_data[ski_data.___ ___ ski_data.___]\n",
-    " .groupby(___)[___]\n",
+    "(ski_data[ski_data.Region != ski_data.state]\n",
+    " .groupby('state')['Region']\n",
     " .value_counts())"
   ] }, {
@@ -689,7 +689,7 @@
    "source": [
     "#Code task 12#\n",
     "#Select the 'Region' and 'state' columns from ski_data and use the `nunique` method to calculate\n",
    "#the number of unique values in each\n",
-    "ski_data[[___, ___]].___"
+    "ski_data[['Region', 'state']].nunique()"
   ] }, {
@@ -721,21 +721,21 @@
    "source": [
     "#Code task 13#\n",
     "#Create two subplots on 1 row and 2 columns with a figsize of (12, 8)\n",
-    "fig, ax = plt.subplots(___, ___, figsize=(___))\n",
+    "fig, ax = plt.subplots(1, 2, figsize=(12, 8))\n",
     "#Specify a horizontal barplot ('barh') as kind of plot (kind=)\n",
-    "ski_data.Region.value_counts().plot(kind=___, ax=ax[0])\n",
+    "ski_data.Region.value_counts().plot(kind='barh', ax=ax[0])\n",
     "#Give the plot a helpful title of 'Region'\n",
-    "ax[0].set_title(___)\n",
+    "ax[0].set_title('Region')\n",
     "#Label the xaxis 'Count'\n",
-    "ax[0].set_xlabel(___)\n",
+    "ax[0].set_xlabel('Count')\n",
     "#Specify a horizontal barplot ('barh') as kind of plot (kind=)\n",
-    "ski_data.state.value_counts().plot(kind=___, ax=ax[1])\n",
+    "ski_data.state.value_counts().plot(kind='barh', ax=ax[1])\n",
     "#Give the plot a helpful title of 'state'\n",
-    "ax[1].set_title(___)\n",
+    "ax[1].set_title('state')\n",
     "#Label the xaxis 'Count'\n",
-    "ax[1].set_xlabel(___)\n",
+    "ax[1].set_xlabel('Count')\n",
     "#Give the subplots a little \"breathing room\" with a wspace of 0.5\n",
-    "plt.subplots_adjust(wspace=___);\n",
+    "plt.subplots_adjust(wspace=0.5);\n",
     "#You're encouraged to explore a few different figure sizes, orientations, and spacing here\n",
     "# as the importance of easy-to-read and informative figures is frequently understated\n",
     "# and you will find the ability to tweak figures invaluable later on"
   ] }, {
@@ -778,7 +778,7 @@
     "#Code task 14#\n",
     "# Calculate average weekday and weekend price by state and sort by the average of the two\n",
     "# Hint: use the pattern dataframe.groupby()[].mean()\n",
-    "state_price_means = ski_data.___(___)[[___, ___]].mean()\n",
+    "state_price_means = ski_data.groupby('state')[['AdultWeekday', 'AdultWeekend']].mean()\n",
     "state_price_means.head()"
   ] }, {
@@ -849,11 +849,11 @@
     "#gather the ticket prices from the 'Adultweekday' and 'AdultWeekend' columns using the `value_vars` argument,\n",
     "#call the resultant price column 'Price' via the `value_name` argument,\n",
     "#name the weekday/weekend indicator column 'Ticket' via the `var_name` argument\n",
-    "ticket_prices = pd.melt(ski_data[[___, ___, ___]], \n",
-    "                        id_vars=___, \n",
-    "                        var_name=___, \n",
-    "                        value_vars=[___, ___], \n",
-    "                        value_name=___)"
+    "ticket_prices = pd.melt(ski_data[['state', 'AdultWeekday', 'AdultWeekend']], \n",
+    "                        id_vars='state', \n",
+    "                        var_name='Ticket', \n",
+    "                        value_vars=['AdultWeekday', 'AdultWeekend'], \n",
+    "                        value_name='Price')"
   ] }, {
@@ -958,7 +958,7 @@
     "#with 'state' on the x-axis, 'Price' as the y-value, and a hue that indicates 'Ticket'\n",
     "#This will use boxplot's x, y, hue, and data arguments.\n",
     "plt.subplots(figsize=(12, 8))\n",
-    "sns.boxplot(x=___, y=___, hue=___, data=ticket_prices)\n",
+    "sns.boxplot(x='state', y='Price', hue='Ticket', data=ticket_prices)\n",
     "plt.xticks(rotation='vertical')\n",
     "plt.ylabel('Price ($)')\n",
     "plt.xlabel('State');"
   ] }, {
@@ -1020,7 +1020,7 @@
     "#Call ski_data's `describe` method for a statistical summary of the numerical columns\n",
     "#Hint: there are fewer summary stat columns than features, so displaying the transpose\n",
     "#will be useful again\n",
-    "ski_data.___.___"
+    "ski_data.describe().T"
   ] }, {
@@ -1086,8 +1086,8 @@
     "#Try passing it an argument figsize=(15,10)\n",
     "#Try calling plt.subplots_adjust() with an argument hspace=0.5 to adjust the spacing\n",
"#It's important you create legible and easy-to-read plots\n", - "ski_data.___(___)\n", - "#plt.subplots_adjust(hspace=___);\n", + "ski_data.hist(figsize=(15,10))\n", + "#plt.subplots_adjust(hspace=0.5);\n", "#Hint: notice how the terminating ';' \"swallows\" some messy output and leads to a tidier notebook" ] }, @@ -1120,7 +1120,7 @@ "source": [ "#Code task 19#\n", "#Filter the 'SkiableTerrain_ac' column to print the values greater than 10000\n", - "ski_data.___[ski_data.___ > ___]" + "ski_data.sort_values[ski_data.column['SkiableTerrain_ac'] > 10000]" ] }, { @@ -1139,7 +1139,7 @@ "#Code task 20#\n", "#Now you know there's only one, print the whole row to investigate all values, including seeing the resort name\n", "#Hint: don't forget the transpose will be helpful here\n", - "ski_data[ski_data.___ > ___].___" + "ski_data[ski_data.column('SkiableTerrain_ac') > 0].transpose()" ] }, { @@ -1185,7 +1185,7 @@ "source": [ "#Code task 21#\n", "#Use the .loc accessor to print the 'SkiableTerrain_ac' value only for this resort\n", - "ski_data.___[39, 'SkiableTerrain_ac']" + "ski_data.loc[39, 'SkiableTerrain_ac']" ] }, { @@ -1196,7 +1196,7 @@ "source": [ "#Code task 22#\n", "#Use the .loc accessor again to modify this value with the correct value of 1819\n", - "ski_data.___[39, 'SkiableTerrain_ac'] = ___" + "ski_data.loc[39, 'SkiableTerrain_ac'] = 1819" ] }, { @@ -1207,7 +1207,7 @@ "source": [ "#Code task 23#\n", "#Use the .loc accessor a final time to verify that the value has been modified\n", - "ski_data.___[39, 'SkiableTerrain_ac']" + "ski_data.loc[39, 'SkiableTerrain_ac']" ] }, { @@ -1559,7 +1559,7 @@ "source": [ "#Code task 24#\n", "#Drop the 'fastEight' column from ski_data. Use inplace=True\n", - "ski_data.drop(columns=___, inplace=___)" + "ski_data.drop(columns='fastEight', inplace=True)" ] }, { @@ -1577,7 +1577,7 @@ "source": [ "#Code task 25#\n", "#Filter the 'yearsOpen' column for values greater than 100\n", - "ski_data.___[ski_data.___ > ___]" + "ski_data.loc[ski_data.column['yearsOpen'] > 100]" ] }, { @@ -1603,7 +1603,7 @@ "#Code task 26#\n", "#Call the hist method on 'yearsOpen' after filtering for values under 1000\n", "#Pass the argument bins=30 to hist(), but feel free to explore other values\n", - "ski_data.___[ski_data.___ < ___].hist(___)\n", + "ski_data.loc[ski_data.column['yearsOpen'] < 1000].hist(bins=30)\n", "plt.xlabel('Years open')\n", "plt.ylabel('Count')\n", "plt.title('Distribution of years open excluding 2019');" @@ -1730,13 +1730,13 @@ "#respectively\n", "#Finally, add a call to the reset_index() method (we recommend you experiment with and without this to see\n", "#what it does)\n", - "state_summary = ski_data.groupby('state').agg(\n", + "c " resorts_per_state=pd.NamedAgg(column='Name', aggfunc='size'), #could pick any column here\n", " state_total_skiable_area_ac=pd.NamedAgg(column='SkiableTerrain_ac', aggfunc='sum'),\n", - " state_total_days_open=pd.NamedAgg(column=__, aggfunc='sum'),\n", - " ___=pd.NamedAgg(column=___, aggfunc=___),\n", - " ___=pd.NamedAgg(column=___, aggfunc=___)\n", - ").___\n", + " state_total_days_open=pd.NamedAgg(column='state, aggfunc='sum'),\n", + " state_total_skiable_area_ac=pd.NamedAgg(column='SkiableTerrain_ac', aggfunc='sum'),\n", + " state_total_days_open=pd.NamedAgg(column='state, aggfunc='sum'),\n", + ")state_summary = ski_data.groupby('state').agg(\n", "state_summary.head()" ] }, @@ -1856,7 +1856,7 @@ "#Code task 29#\n", "#Use pandas' `read_html` method to read the table from the URL below\n", "states_url = 
-    "usa_states = pd.___(___)"
+    "usa_states = pd.read_html(states_url)"
   ] }, {
@@ -2088,7 +2088,7 @@
     "#Code task 30#\n",
     "#Use the iloc accessor to get the pandas Series for column number 4 from `usa_states`\n",
     "#It should be a column of dates\n",
-    "established = usa_sates.___[:, 4]"
+    "established = usa_states.iloc[:, 4]"
   ] }, {
@@ -2178,8 +2178,8 @@
     "#Now use the iloc accessor again to extract columns 0, 5, and 6 and the dataframe's `copy()` method\n",
     "#Set the names of these extracted columns to 'state', 'state_population', and 'state_area_sq_miles',\n",
     "#respectively.\n",
-    "usa_states_sub = usa_states.___[:, [___]].copy()\n",
-    "usa_states_sub.columns = [___]\n",
+    "usa_states_sub = usa_states.iloc[:, [0, 5, 6]].copy()\n",
+    "usa_states_sub.columns = ['state', 'state_population', 'state_area_sq_miles']\n",
     "usa_states_sub.head()"
   ] }, {
@@ -2199,7 +2199,7 @@
     "#Code task 32#\n",
     "#Find the states in `state_summary` that are not in `usa_states_sub`\n",
     "#Hint: set(list1) - set(list2) is an easy way to get items in list1 that are not in list2\n",
-    "missing_states = ___(state_summary.state) - ___(usa_states_sub.state)\n",
+    "missing_states = set(state_summary.state) - set(usa_states_sub.state)\n",
     "missing_states"
   ] }, {
@@ -2262,7 +2262,7 @@
     "#value='' #empty string as replacement\n",
     "#regex=True #we used a regex in our `to_replace` argument\n",
     "#inplace=True #Do this \"in place\"\n",
-    "usa_states_sub.state.___(to_replace=___, value=__, regex=___, inplace=___)\n",
+    "usa_states_sub.state.replace(to_replace='\[.*\]', value='', regex=True, inplace=True)\n",
     "usa_states_sub.state[usa_states_sub.state.str.contains('Massachusetts|Pennsylvania|Rhode Island|Virginia')]"
   ] }, {
@@ -2275,7 +2275,7 @@
     "#Code task 34#\n",
     "#And now verify none of our states are missing by checking that there are no states in\n",
     "#state_summary that are not in usa_states_sub (as earlier using `set()`)\n",
-    "missing_states = ___(state_summary.state) - ___(usa_states_sub.state)\n",
+    "missing_states = set(state_summary.state) - set(usa_states_sub.state)\n",
     "missing_states"
   ] }, {
@@ -2295,7 +2295,7 @@
     "#Code task 35#\n",
     "#Use 'state_summary's `merge()` method to combine our new data in 'usa_states_sub'\n",
     "#specify the arguments how='left' and on='state'\n",
-    "state_summary = state_summary.___(usa_states_sub, ___=___, ___=___)\n",
+    "state_summary = state_summary.merge(usa_states_sub, how='left', on='state')\n",
     "state_summary.head()"
   ] }, {
@@ -2329,7 +2329,7 @@
     "#Code task 36#\n",
     "#Use ski_data's `plot()` method to create a scatterplot (kind='scatter') with 'AdultWeekday' on the x-axis and\n",
     "#'AdultWeekend' on the y-axis\n",
-    "ski_data.___(x=___, y=___, kind=___);"
+    "ski_data.plot(x='AdultWeekday', y='AdultWeekend', kind='scatter');"
   ] }, {
@@ -2347,7 +2347,7 @@
     "#Code task 37#\n",
     "#Use the loc accessor on ski_data to print the 'AdultWeekend' and 'AdultWeekday' columns for Montana only\n",
-    "ski_data.___[ski_data.state == ___, [___, ___]]"
+    "ski_data.loc[ski_data.state == 'Montana', ['AdultWeekend', 'AdultWeekday']]"
   ] }, {