1. Introduction to Python for Data Science & ML
Python is widely used for data analysis, visualization, and machine learning because it offers:
- Rich libraries such as NumPy, Pandas, Matplotlib, and Scikit-Learn
- Easy syntax for rapid prototyping
- Strong community support
1.1 Setup: Install required libraries
pip install numpy pandas matplotlib seaborn scikit-learn tensorflow    # Keras is bundled with TensorFlow 2.x, so no separate keras install is needed
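A quick way to confirm the installation worked (a minimal sketch; it simply prints whatever versions are on your machine):
import numpy, pandas, sklearn, tensorflow as tf
print("NumPy:", numpy.__version__)
print("Pandas:", pandas.__version__)
print("Scikit-Learn:", sklearn.__version__)
print("TensorFlow:", tf.__version__)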
2. Essential Python Libraries for Data Science
| Library | Purpose |
| --- | --- |
| NumPy | Numerical computing (arrays, matrices) |
| Pandas | Data manipulation (DataFrames, Series) |
| Matplotlib | Data visualization |
| Seaborn | Statistical plotting |
| Scikit-Learn | Machine learning algorithms |
| TensorFlow / Keras | Deep learning frameworks |
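NumPy is the only library in the table without its own snippet later in this sheet; a minimal illustration of its array operations (made-up values):
import numpy as np
a = np.array([[1, 2], [3, 4]])  # 2x2 array
print(a.mean())                 # Mean of all elements -> 2.5
print(a @ a)                    # Matrix multiplication
print(a.reshape(-1))            # Flatten to 1-D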
3. Data Manipulation with Pandas
3.1 Loading Data
import pandas as pd
df = pd.read_csv("data.csv") # Load CSV file
df.head() # View first 5 rows
df.info() # Summary of dataset
3.2 Cleaning Data
df.dropna()                            # Drop rows with missing values (returns a copy; assign back to keep it)
df.fillna(df.mean(numeric_only=True))  # Fill missing numeric values with each column's mean
df["category"] = df["category"].astype("category")  # Convert data type
df[df["age"] > 30] # Select rows where age > 30
df[["name", "salary"]] # Select specific columns
df.sort_values("salary", ascending=False) # Sort by salary
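Grouping and aggregation round out these basics; a short sketch assuming the same category and salary columns:
df.groupby("category")["salary"].mean()    # Average salary per category
df["salary"].describe()                    # Count, mean, std, quartiles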
4. Data Visualization with Matplotlib & Seaborn
4.1 Basic Plotting
import matplotlib.pyplot as plt
import seaborn as sns
plt.plot([1, 2, 3, 4], [10, 20, 25, 30]) # Line plot
plt.show()
4.2 Statistical Plots with Seaborn
sns.histplot(df["salary"], bins=20)            # Histogram
plt.show()
sns.boxplot(x=df["salary"])                    # Boxplot
plt.show()
sns.scatterplot(x="age", y="salary", data=df)  # Scatter plot
plt.show()
sns.heatmap(df.corr(numeric_only=True), annot=True)  # Correlation matrix (numeric columns only)
plt.show()
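To keep a plot instead of only displaying it, save the current figure to a file (hypothetical filename):
sns.histplot(df["salary"], bins=20)
plt.savefig("salary_hist.png", dpi=150)  # Write the figure to disk
plt.close()                              # Free the figure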
5. Machine Learning with Scikit-Learn
5.1 Data Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
X = df.drop("target", axis=1) # Features
y = df["target"] # Target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)  # 80/20 train/test split
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)  # Fit the scaler on training data only
X_test = scaler.transform(X_test)        # Apply the same scaling to test data
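LabelEncoder is imported above but not shown in use; a minimal sketch for a string-valued target (assumes df["target"] holds class labels):
le = LabelEncoder()
y_encoded = le.fit_transform(df["target"])  # e.g. ["no", "yes", ...] -> [0, 1, ...]
print(le.classes_)                          # Original labels, in encoded order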
5.2 Linear Regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
model = LinearRegression()
model.fit(X_train, y_train)
predictions = model.predict(X_test)
mse = mean_squared_error(y_test, predictions)
print("MSE:", mse)
5.3 Decision Tree Classifier
# The classifier examples in 5.3-5.5 assume a categorical target column
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
model = DecisionTreeClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
5.4 Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=100)
model.fit(X_train, y_train)
print("Accuracy:", model.score(X_test, y_test))
5.5 Support Vector Machine (SVM)
from sklearn.svm import SVC
model = SVC(kernel="linear")
model.fit(X_train, y_train)
print("Accuracy:", model.score(X_test, y_test))
6. Unsupervised Learning Algorithms
6.1 K-Means Clustering
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=3, random_state=42)  # Fix the seed for reproducible clusters
kmeans.fit(X_train)
print("Cluster Centers:", kmeans.cluster_centers_)
6.2 Principal Component Analysis (PCA)
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_train)
print("Explained Variance:", pca.explained_variance_ratio_)
7. Deep Learning with TensorFlow & Keras
7.1 Neural Network for Classification
import tensorflow as tf
from tensorflow import keras
model = keras.Sequential([
    keras.layers.Dense(32, activation="relu", input_shape=(X_train.shape[1],)),
    keras.layers.Dense(16, activation="relu"),
    keras.layers.Dense(1, activation="sigmoid")   # Sigmoid output for a binary (0/1) target
])
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))
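After training, evaluate on the held-out set:
loss, acc = model.evaluate(X_test, y_test, verbose=0)
print("Test accuracy:", acc)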
7.2 Convolutional Neural Network (CNN)
model = keras.Sequential([
    keras.layers.Conv2D(32, (3, 3), activation="relu", input_shape=(28, 28, 1)),  # 28x28 grayscale images
    keras.layers.MaxPooling2D((2, 2)),
    keras.layers.Flatten(),
    keras.layers.Dense(64, activation="relu"),
    keras.layers.Dense(10, activation="softmax")   # 10 output classes
])
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
8. Model Evaluation & Optimization
8.1 Cross-Validation
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model, X_train, y_train, cv=5)  # model must be a scikit-learn estimator (e.g. RandomForestClassifier())
print("Mean Accuracy:", scores.mean())
8.2 Hyperparameter Tuning with GridSearchCV
from sklearn.model_selection import GridSearchCV
param_grid = {"n_estimators": [10, 50, 100]}
grid_search = GridSearchCV(RandomForestClassifier(), param_grid, cv=3)
grid_search.fit(X_train, y_train)
print("Best Parameters:", grid_search.best_params_)
8.3 Confusion Matrix & Classification Report
from sklearn.metrics import confusion_matrix, classification_report
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
9. Deploying Machine Learning Models
9.1 Save & Load Models
import joblib
joblib.dump(model, "model.pkl") # Save model
loaded_model = joblib.load("model.pkl") # Load model
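joblib suits scikit-learn estimators; Keras models use their own save format (a sketch assuming model is the Keras network from section 7; hypothetical filename):
model.save("model.keras")                          # Save architecture + weights
loaded_nn = keras.models.load_model("model.keras")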
9.2 Serve Predictions with a Flask API
from flask import Flask, request, jsonify
import joblib
app = Flask(__name__)
model = joblib.load("model.pkl")
@app.route("/predict", methods=["POST"])
def predict():
    data = request.json                              # Expects JSON like {"features": [...]}
    prediction = model.predict([data["features"]])   # Wrap in a list: a single sample
    return jsonify({"prediction": prediction.tolist()})
if __name__ == "__main__":
    app.run(debug=True)
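Once the server is running, a client can call the endpoint (a sketch using the requests library; the feature values are illustrative):
import requests
resp = requests.post("http://127.0.0.1:5000/predict", json={"features": [35, 50000]})
print(resp.json())  # e.g. {"prediction": [...]}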
10. Learning Resources
- Scikit-Learn Docs
- TensorFlow/Keras Docs
- Coursera Machine Learning
- Kaggle Datasets