{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "faa98ef5",
   "metadata": {},
   "source": [
    "# Introduction to Python"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0034e66e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Calculator\n",
    "a = (50 + 1.45) / 12.5 # (CTRL + ENTER)\n",
    "b = \"hello\" + \" \" + \"world\"\n",
    "\n",
    "# print\n",
    "print(a)\n",
    "print(b)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6c3cd3e4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Lists\n",
    "x = [3, \"hello\", [3, 4]]\n",
    "y = [1, 2, 3]\n",
    "print(x + y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9b66afe9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Flow control\n",
    "for i in range(10):\n",
    "    print(i)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2c4cd1eb",
   "metadata": {},
   "outputs": [],
   "source": [
    "for i in x:\n",
    "    print(i)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "37ba634c",
   "metadata": {},
   "outputs": [],
   "source": [
    "i = 0\n",
    "while i < 10:\n",
    "    print(i)\n",
    "    i += 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6b37c580",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Functions\n",
    "def plus(a, b):\n",
    "    \"\"\"Add two values with the + operator and return the result.\"\"\"\n",
    "    total = a + b\n",
    "    return total\n",
    "\n",
    "plus(1, 2)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f7930b02",
   "metadata": {},
   "source": [
    "## Numpy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6e21da6a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Install library (the %pip magic requires IPython 7.3 or later)\n",
    "# %pip install numpy  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d8f476ba",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Older versions\n",
    "# import sys\n",
    "# !{sys.executable} -m pip install numpy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1da857f2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import library\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ba2a6c3b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Numpy arrays\n",
    "x = np.array([3, 4, 5])\n",
    "y = np.array([4, 9, 7])\n",
    "\n",
    "print(x)\n",
    "print(y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4bc27cf4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Numpy allows mathematical operations between arrays\n",
    "print(x+y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "171fae9b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Different from normal lists\n",
    "x = [3,4,5]\n",
    "y = [4,9,7]\n",
    "print(x+y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4174aa17",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 2-dimensional arrays\n",
    "\n",
    "a = np.array([[1, 2], [3, 4]])\n",
    "b = np.array([[5, 6], [7, 8]])\n",
    "\n",
    "print(a*b)\n",
    "\n",
    "a.dtype"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "750cd7c8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Documentation\n",
    "np.array?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f121dc20",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Array dimensions\n",
    "print(a.ndim)    # Number of dimensions\n",
    "print(a.shape)   # Shape of the array"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "734e4cad",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reshaping arrays\n",
    "a = np.array([[1, 2], [3, 4]])\n",
    "print(a.reshape([1,4]))  # [rows, columns]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "01f0c1be",
   "metadata": {},
   "outputs": [],
   "source": [
    "c = np.array([1, 2, 3, 4, 5, 6])\n",
    "print(c.reshape(2, 3))    # Convert a 1d array to a 2x3 array"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ded17360",
   "metadata": {},
   "source": [
    "## Indexing Arrays"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "843ba712",
   "metadata": {},
   "outputs": [],
   "source": [
    "A = np.array(np.arange(16)).reshape((4, 4))\n",
    "A"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f1c07625",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Select single element\n",
    "\n",
    "print(A[1,2]) #[row, column]\n",
    "# or\n",
    "print(A[1][2])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a346a413",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Select entire row/column\n",
    "print(A[0,:])    # : selects everything\n",
    "\n",
    "print(A[0])\n",
    "\n",
    "print(A[:,0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3c8ec982",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Select multiple rows/columns\n",
    "print(A[[1, 2],:])    # 2nd and 3rd row\n",
    "print(A[:, [2,3]])    # 3rd and 4th column"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "970c8bcf",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Select a submatrix\n",
    "print(A[[0,1], [0,1]]) # Doesn't work, selects individual elements [0,0] and [1,1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b44ff2c3",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(A[[0,1]][:,[0,1]]) # Select the rows first, then the columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7169f2f9",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(A[0:2, 0:2]) # Or with slices"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "13109eb0",
   "metadata": {},
   "source": [
    "## Boolean indexing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8bde8681",
   "metadata": {},
   "outputs": [],
   "source": [
    "A = [0,1,2,3,4,5,6,7,8,9]\n",
    "B = [True, False, True, False, True, False, True, False, True, False]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e8f76526",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Plain Python lists do not support boolean-mask indexing: A[B] raises a TypeError.\n",
    "# The error is caught and printed so that Restart & Run All can continue past this cell.\n",
    "try:\n",
    "    A[B]\n",
    "except TypeError as err:\n",
    "    print(f'TypeError: {err}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cc641cfc",
   "metadata": {},
   "outputs": [],
   "source": [
    "# With numpy arrays, we select every element, where B == True\n",
    "A = np.array(A)\n",
    "print(A[B])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "af5c5675",
   "metadata": {},
   "outputs": [],
   "source": [
    "# We can use boolean indexing to select elements matching specific conditions\n",
    "print(A[A > 5])\n",
    "print(A[A % 2 == 0])\n",
    "print(A[(A > 2) & (A < 7)])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "42f640d4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Also works with n-dimensional arrays\n",
    "A = np.array(np.arange(16)).reshape((4, 4))\n",
    "print(A)\n",
    "rows = [True, False, True, False]\n",
    "cols = [True, True, False, False]\n",
    "A[rows][:,cols]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "43f3ac10",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Select rows that sum up to more than 25\n",
    "high_rows = np.sum(A, axis=1) > 25\n",
    "print(high_rows)\n",
    "A[high_rows]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "234d97d7",
   "metadata": {},
   "source": [
    "# Pandas and Datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "daed4c11",
   "metadata": {},
   "outputs": [],
   "source": [
    "# In addition to Numpy arrays, we can use additional libraries to better process larger datasets\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b86ef5ad",
   "metadata": {},
   "outputs": [],
   "source": [
    "auto = pd.read_csv('https://raw.githubusercontent.com/mwaskom/seaborn-data/master/mpg.csv')\n",
    "auto"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4749e43d",
   "metadata": {},
   "source": [
    "## Pandas dataframes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "286249cc",
   "metadata": {},
   "outputs": [],
   "source": [
    "# How many examples are in a dataframe\n",
    "len(auto)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e2c17db7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Print column names\n",
    "auto.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0a19bcb3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Are there missing values in the dataset?\n",
    "# Note: the built-in any(df) iterates over the column names (non-empty strings),\n",
    "# so it is always True. Reduce per column first, then across the columns.\n",
    "auto.isna().any().any()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "66ab1a52",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Count the missing values per column\n",
    "auto.isna().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1649285b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Drop missing values\n",
    "auto = auto.dropna()\n",
    "print(len(auto))\n",
    "print(auto.isna().sum())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8f21a7ad",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Drop columns\n",
    "auto.drop(columns=\"horsepower\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1abfc5f7",
   "metadata": {},
   "source": [
    "### Indexing Pandas datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "aee51e9f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# auto[1] looks up a column labelled 1; no such column exists, so pandas raises a KeyError.\n",
    "# The error is caught and printed so that Restart & Run All can continue past this cell.\n",
    "try:\n",
    "    auto[1]\n",
    "except KeyError as err:\n",
    "    print(f'KeyError: {err}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "db7fb858",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(auto['mpg'])  # Index by column names instead"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e969e76d",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(auto[:3])     # Or use slices to select rows"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "431d5eff",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(auto.loc[0,\"mpg\"]) # Or with .loc[row_label, column_name]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ddb061ed",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(auto.iloc[0,0])    # Or with .iloc[row_number, column_number]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "99bd5b0a",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(auto.loc[0:5, [\"mpg\", \"cylinders\"]])  # Can also select multiple rows and columns"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "692f0567",
   "metadata": {},
   "source": [
    "We can use logical indexing to select specific rows"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "46002a64",
   "metadata": {},
   "outputs": [],
   "source": [
    "idx_80 = auto['model_year'] > 80\n",
    "print(auto.loc[idx_80, ['name', 'model_year']])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "da464d95",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Or in one line\n",
    "print(auto.loc[auto['model_year'] > 80, ['name', 'model_year']])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7c5728ef",
   "metadata": {},
   "source": [
    "## Adding values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "73dae125",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create new column\n",
    "auto['Horsepower / Weight'] = auto['horsepower'] / auto['weight']\n",
    "auto"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e6a64513",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(auto.columns)\n",
    "# Create new (empty) row\n",
    "# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so build a\n",
    "# one-row dataframe of missing values with the same columns and concatenate it.\n",
    "# The result is only displayed; auto itself is not modified.\n",
    "empty_row = pd.DataFrame([[None] * len(auto.columns)], columns=auto.columns)\n",
    "pd.concat([auto, empty_row], ignore_index=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0717e538",
   "metadata": {},
   "source": [
    "## Additional functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d74d579f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Merge datasets\n",
    "pd.concat([auto, auto])\n",
    "\n",
    "# This can also be used to add new rows - simply create a new dataframe and concat it to the original"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1b8c9f98",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save dataset\n",
    "auto.to_csv('./is_lab_1_dataset.txt', index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}