{ "cells": [ { "cell_type": "markdown", "id": "faa98ef5", "metadata": {}, "source": [ "# Introduction to Python" ] }, { "cell_type": "code", "execution_count": null, "id": "0034e66e", "metadata": {}, "outputs": [], "source": [ "# Calculator\n", "a = (50 + 1.45) / 12.5 # Press CTRL + ENTER to run the cell\n", "b = \"hello\" + \" \" + \"world\"\n", "\n", "# print\n", "print(a)\n", "print(b)" ] }, { "cell_type": "code", "execution_count": null, "id": "6c3cd3e4", "metadata": {}, "outputs": [], "source": [ "# Lists\n", "x = [3, \"hello\", [3, 4]]\n", "y = [1, 2, 3]\n", "print(x + y)" ] }, { "cell_type": "code", "execution_count": null, "id": "9b66afe9", "metadata": {}, "outputs": [], "source": [ "# Flow control\n", "for i in range(10):\n", " print(i)" ] }, { "cell_type": "code", "execution_count": null, "id": "2c4cd1eb", "metadata": {}, "outputs": [], "source": [ "for i in x:\n", " print(i)" ] }, { "cell_type": "code", "execution_count": null, "id": "37ba634c", "metadata": {}, "outputs": [], "source": [ "i = 0\n", "while i < 10:\n", " print(i)\n", " i += 1" ] }, { "cell_type": "code", "execution_count": null, "id": "6b37c580", "metadata": {}, "outputs": [], "source": [ "# Functions\n", "def plus(a, b):\n", " return a + b\n", "\n", "plus(1, 2)" ] }, { "cell_type": "markdown", "id": "f7930b02", "metadata": {}, "source": [ "## Numpy" ] }, { "cell_type": "code", "execution_count": null, "id": "6e21da6a", "metadata": {}, "outputs": [], "source": [ "# Install library (IPython 7.3 or later)\n", "# %pip install numpy " ] }, { "cell_type": "code", "execution_count": null, "id": "d8f476ba", "metadata": {}, "outputs": [], "source": [ "# Older versions\n", "# import sys\n", "# !{sys.executable} -m pip install numpy" ] }, { "cell_type": "code", "execution_count": null, "id": "1da857f2", "metadata": {}, "outputs": [], "source": [ "# Import library\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": null, "id": "ba2a6c3b", "metadata": {}, "outputs": [], "source": [ "# Numpy arrays\n", 
"x = np.array([3, 4, 5])\n", "y = np.array([4, 9, 7])\n", "\n", "print(x)\n", "print(y)" ] }, { "cell_type": "code", "execution_count": null, "id": "4bc27cf4", "metadata": {}, "outputs": [], "source": [ "# Numpy allows element-wise mathematical operations between arrays\n", "print(x+y)" ] }, { "cell_type": "code", "execution_count": null, "id": "171fae9b", "metadata": {}, "outputs": [], "source": [ "# Different from normal lists, where + concatenates\n", "x = [3,4,5]\n", "y = [4,9,7]\n", "print(x+y)" ] }, { "cell_type": "code", "execution_count": null, "id": "4174aa17", "metadata": {}, "outputs": [], "source": [ "# 2-dimensional arrays (* multiplies element-wise; use a @ b for matrix multiplication)\n", "\n", "a = np.array([[1, 2], [3, 4]])\n", "b = np.array([[5, 6], [7, 8]])\n", "\n", "print(a*b)\n", "\n", "a.dtype" ] }, { "cell_type": "code", "execution_count": null, "id": "750cd7c8", "metadata": {}, "outputs": [], "source": [ "# Documentation (append ? to a name to show its help)\n", "np.array?" ] }, { "cell_type": "code", "execution_count": null, "id": "f121dc20", "metadata": {}, "outputs": [], "source": [ "# Array dimensions\n", "print(a.ndim) # Number of dimensions\n", "print(a.shape) # Shape of the array" ] }, { "cell_type": "code", "execution_count": null, "id": "734e4cad", "metadata": {}, "outputs": [], "source": [ "# Reshaping arrays\n", "a = np.array([[1, 2], [3, 4]])\n", "print(a.reshape([1,4])) # [rows, columns]" ] }, { "cell_type": "code", "execution_count": null, "id": "01f0c1be", "metadata": {}, "outputs": [], "source": [ "c = np.array([1, 2, 3, 4, 5, 6])\n", "print(c.reshape(2, 3)) # Convert a 1d array to a 2x3 array" ] }, { "cell_type": "markdown", "id": "ded17360", "metadata": {}, "source": [ "## Indexing Arrays" ] }, { "cell_type": "code", "execution_count": null, "id": "843ba712", "metadata": {}, "outputs": [], "source": [ "A = np.array(np.arange(16)).reshape((4, 4))\n", "A" ] }, { "cell_type": "code", "execution_count": null, "id": "f1c07625", "metadata": {}, "outputs": [], "source": [ "# Select single element\n", "\n", "print(A[1,2]) # [row, column]\n", "# or\n", 
"print(A[1][2])" ] }, { "cell_type": "code", "execution_count": null, "id": "a346a413", "metadata": {}, "outputs": [], "source": [ "# Select entire row/column\n", "print(A[0,:]) # : selects everything\n", "\n", "print(A[0])\n", "\n", "print(A[:,0])" ] }, { "cell_type": "code", "execution_count": null, "id": "3c8ec982", "metadata": {}, "outputs": [], "source": [ "# Select multiple rows/columns\n", "print(A[[1, 2],:]) # 2nd and 3rd row\n", "print(A[:, [2,3]]) # 3rd and 4th column" ] }, { "cell_type": "code", "execution_count": null, "id": "970c8bcf", "metadata": {}, "outputs": [], "source": [ "# Select a submatrix\n", "print(A[[0,1], [0,1]]) # Doesn't work, selects individual elements [0,0] and [1,1]" ] }, { "cell_type": "code", "execution_count": null, "id": "b44ff2c3", "metadata": {}, "outputs": [], "source": [ "print(A[[0,1]][:,[0,1]]) # Select the rows first, then the columns" ] }, { "cell_type": "code", "execution_count": null, "id": "7169f2f9", "metadata": {}, "outputs": [], "source": [ "print(A[0:2, 0:2]) # Or with slices" ] }, { "cell_type": "markdown", "id": "13109eb0", "metadata": {}, "source": [ "## Boolean indexing" ] }, { "cell_type": "code", "execution_count": null, "id": "8bde8681", "metadata": {}, "outputs": [], "source": [ "A = [0,1,2,3,4,5,6,7,8,9]\n", "B = [True, False, True, False, True, False, True, False, True, False]" ] }, { "cell_type": "code", "execution_count": null, "id": "e8f76526", "metadata": {}, "outputs": [], "source": [ "# Error - plain Python lists cannot be indexed with a list of booleans\n", "try:\n", "    A[B]\n", "except TypeError as err:\n", "    print(\"TypeError:\", err) # Caught so the notebook still runs from top to bottom" ] }, { "cell_type": "code", "execution_count": null, "id": "cc641cfc", "metadata": {}, "outputs": [], "source": [ "# With numpy arrays, we select every element, where B == True\n", "A = np.array(A)\n", "print(A[B])" ] }, { "cell_type": "code", "execution_count": null, "id": "af5c5675", "metadata": {}, "outputs": [], "source": [ "# We can use boolean indexing to select elements matching specific conditions\n", "print(A[A > 5])\n", "print(A[A % 2 == 0])\n", 
"print(A[(A > 2) & (A < 7)])" ] }, { "cell_type": "code", "execution_count": null, "id": "42f640d4", "metadata": {}, "outputs": [], "source": [ "# Also works with n-dimensional arrays\n", "A = np.array(np.arange(16)).reshape((4, 4))\n", "print(A)\n", "rows = [True, False, True, False]\n", "cols = [True, True, False, False]\n", "A[rows][:,cols]" ] }, { "cell_type": "code", "execution_count": null, "id": "43f3ac10", "metadata": {}, "outputs": [], "source": [ "# Select rows that sum up to more than 25\n", "high_rows = np.sum(A, axis=1) > 25\n", "print(high_rows)\n", "A[high_rows]" ] }, { "cell_type": "markdown", "id": "234d97d7", "metadata": {}, "source": [ "# Pandas and Datasets" ] }, { "cell_type": "code", "execution_count": null, "id": "daed4c11", "metadata": {}, "outputs": [], "source": [ "# In addition to Numpy arrays, we can use additional libraries to better process larger datasets\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": null, "id": "b86ef5ad", "metadata": {}, "outputs": [], "source": [ "auto = pd.read_csv('https://raw.githubusercontent.com/mwaskom/seaborn-data/master/mpg.csv')\n", "auto" ] }, { "cell_type": "markdown", "id": "4749e43d", "metadata": {}, "source": [ "## Pandas dataframes" ] }, { "cell_type": "code", "execution_count": null, "id": "286249cc", "metadata": {}, "outputs": [], "source": [ "# How many examples are in a dataframe\n", "len(auto)" ] }, { "cell_type": "code", "execution_count": null, "id": "e2c17db7", "metadata": {}, "outputs": [], "source": [ "# Print column names\n", "auto.columns" ] }, { "cell_type": "code", "execution_count": null, "id": "0a19bcb3", "metadata": {}, "outputs": [], "source": [ "# Are there missing values in the dataset\n", "# The first .any() checks each column, the second combines the per-column results.\n", "# (The built-in any(auto.isna()) would loop over the column names and always be True.)\n", "auto.isna().any().any()" ] }, { "cell_type": "code", "execution_count": null, "id": "66ab1a52", "metadata": {}, "outputs": [], "source": [ "# Count the missing values per column\n", "auto.isna().sum()" ] }, { "cell_type": "code", "execution_count": null, "id": "1649285b", 
"metadata": {}, "outputs": [], "source": [ "# Drop missing values\n", "auto = auto.dropna()\n", "print(len(auto))\n", "print(auto.isna().sum())" ] }, { "cell_type": "code", "execution_count": null, "id": "8f21a7ad", "metadata": {}, "outputs": [], "source": [ "# Drop columns (returns a new dataframe, auto itself keeps the column)\n", "auto.drop(columns=\"horsepower\")" ] }, { "cell_type": "markdown", "id": "1abfc5f7", "metadata": {}, "source": [ "### Indexing Pandas datasets" ] }, { "cell_type": "code", "execution_count": null, "id": "aee51e9f", "metadata": {}, "outputs": [], "source": [ "# Error, cannot index by numbers - auto[1] looks for a column labelled 1\n", "try:\n", "    auto[1]\n", "except KeyError as err:\n", "    print(\"KeyError:\", err) # Caught so the notebook still runs from top to bottom" ] }, { "cell_type": "code", "execution_count": null, "id": "db7fb858", "metadata": {}, "outputs": [], "source": [ "print(auto['mpg']) # Index by column names instead" ] }, { "cell_type": "code", "execution_count": null, "id": "e969e76d", "metadata": {}, "outputs": [], "source": [ "print(auto[:3]) # Or use slices to select rows" ] }, { "cell_type": "code", "execution_count": null, "id": "431d5eff", "metadata": {}, "outputs": [], "source": [ "print(auto.loc[0,\"mpg\"]) # Or with .loc[row_label, column_name]" ] }, { "cell_type": "code", "execution_count": null, "id": "ddb061ed", "metadata": {}, "outputs": [], "source": [ "print(auto.iloc[0,0]) # Or with .iloc[row_number, column_number]" ] }, { "cell_type": "code", "execution_count": null, "id": "99bd5b0a", "metadata": {}, "outputs": [], "source": [ "print(auto.loc[0:5, [\"mpg\", \"cylinders\"]]) # Can also select multiple rows and columns" ] }, { "cell_type": "markdown", "id": "692f0567", "metadata": {}, "source": [ "We can use logical indexing to select specific rows" ] }, { "cell_type": "code", "execution_count": null, "id": "46002a64", "metadata": {}, "outputs": [], "source": [ "idx_80 = auto['model_year'] > 80\n", "print(auto.loc[idx_80, ['name', 'model_year']])" ] }, { "cell_type": "code", "execution_count": null, "id": "da464d95", "metadata": {}, "outputs": [], "source": [ "# Or in one line\n", "print(auto.loc[auto['model_year'] > 80, 
['name', 'model_year']])" ] }, { "cell_type": "markdown", "id": "7c5728ef", "metadata": {}, "source": [ "## Adding values" ] }, { "cell_type": "code", "execution_count": null, "id": "73dae125", "metadata": {}, "outputs": [], "source": [ "# Create new column\n", "auto['Horsepower / Weight'] = auto['horsepower'] / auto['weight']\n", "auto" ] }, { "cell_type": "code", "execution_count": null, "id": "e6a64513", "metadata": {}, "outputs": [], "source": [ "print(auto.columns)\n", "# Create new (empty) row\n", "# DataFrame.append was removed in pandas 2.0, so renumber the rows 0..n-1 and\n", "# reindex with one extra label; the added last row is filled with NaN.\n", "# This returns a new dataframe, auto itself is not changed.\n", "auto.reset_index(drop=True).reindex(range(len(auto) + 1))" ] }, { "cell_type": "markdown", "id": "0717e538", "metadata": {}, "source": [ "## Additional functions" ] }, { "cell_type": "code", "execution_count": null, "id": "d74d579f", "metadata": {}, "outputs": [], "source": [ "# Merge datasets\n", "pd.concat([auto, auto])\n", "\n", "# This can also be used to add new rows - simply create a new dataframe and concat it to the original" ] }, { "cell_type": "code", "execution_count": null, "id": "1b8c9f98", "metadata": {}, "outputs": [], "source": [ "# Save dataset\n", "auto.to_csv('./is_lab_1_dataset.txt', index=False)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 5 }