import cv2
import numpy as np
import matplotlib.pyplot as plt
import dlib
# Step 1: Read the Image
image_path = "Nihar.jpg" # Replace with your image path
image = cv2.imread(image_path)
plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
plt.title("Original Image")
plt.show()
# Step 2: Convert to Grayscale
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
plt.imshow(gray, cmap='gray')
plt.title("Grayscale Image")
plt.show()
# Step 3: Compute Gradients
gx = cv2.Sobel(gray, cv2.CV_32F, 1, 0, ksize=1)
gy = cv2.Sobel(gray, cv2.CV_32F, 0, 1, ksize=1)
magnitude, angle = cv2.cartToPolar(gx, gy, angleInDegrees=True)
plt.imshow(magnitude, cmap='gray')
plt.title("Gradient Magnitude")
plt.show()
plt.imshow(angle, cmap='gray')
plt.title("Gradient Direction")
plt.show()
# Step 4: Create HOG Descriptors
cell_size = 8
bin_size = 9
angle_unit = 360 // bin_size
cell_x = gray.shape[1] // cell_size
cell_y = gray.shape[0] // cell_size
hog_descriptor = np.zeros((cell_y, cell_x, bin_size))
for i in range(cell_y):
    for j in range(cell_x):
        cell_magnitude = magnitude[i * cell_size: (i + 1) * cell_size, j * cell_size: (j + 1) * cell_size]
        cell_angle = angle[i * cell_size: (i + 1) * cell_size, j * cell_size: (j + 1) * cell_size]
        histogram, _ = np.histogram(cell_angle, bins=bin_size, range=(0, 360), weights=cell_magnitude)
        hog_descriptor[i, j, :] = histogram
plt.imshow(hog_descriptor[:, :, 0], cmap='gray')
plt.title("HOG Descriptor (first orientation bin)")
plt.show()




# Step 5: Face Detection using dlib's HOG-based frontal face detector
detector = dlib.get_frontal_face_detector()
faces = detector(gray)
for face in faces:
    x, y, w, h = face.left(), face.top(), face.width(), face.height()
    cv2.rectangle(image, (x, y), (x + w, y + h), (0, 255, 0), 2)
plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
plt.title("Detected Faces")
plt.show()
The trained SVM model finds the optimal hyperplane that separates the face and non-face HOG feature vectors.
Detection Phase
Sliding Window: A fixed-size window is slid over the input image at multiple scales and positions.
HOG Feature Extraction: For each window, compute the HOG features.
Classification: Use the trained SVM classifier to classify the HOG feature vector of each window as face (+1) or non-face (−1). The SVM decision function for a window with feature vector h is
score = w⋅h + b
If the score is positive, the window is classified as containing a face; otherwise, it is classified as a non-face.
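As an illustrative sketch (not part of the pipeline below), the score w⋅h + b for a single window can be computed directly from a fitted scikit-learn LinearSVC; the names clf and h mirror the classifier and HOG feature vector used in the training code that follows.
import numpy as np
# Hypothetical helper: SVM score w.h + b for one window's HOG feature vector h,
# assuming clf is a fitted sklearn.svm.LinearSVC.
def svm_window_score(clf, h):
    w = clf.coef_[0]        # learned weight vector w
    b = clf.intercept_[0]   # learned bias b
    return float(np.dot(w, h) + b)
# Equivalent to clf.decision_function([h])[0]; a positive score means "face".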
import cv2
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm
from skimage.feature import hog
from skimage import data, exposure
# Load training images and labels
face_image_paths = ["Modi.jpg","Nihar.jpg"] # List of face images
non_face_image_paths = ["backg1.jpeg","backg2.jpeg"] # List of non-face images
# Function to extract HOG features from a resized image
def extract_hog_features(image, resize_dim=(64, 128)):
    # Resize image to a fixed size
    resized_image = cv2.resize(image, resize_dim)
    # channel_axis=-1 replaces the deprecated multichannel=True argument
    features, hog_image = hog(resized_image, pixels_per_cell=(8, 8), cells_per_block=(2, 2), visualize=True, channel_axis=-1)
    return features
# Load images and extract HOG features
face_features = [extract_hog_features(cv2.imread(image_path)) for image_path in face_image_paths]
non_face_features = [extract_hog_features(cv2.imread(image_path)) for image_path in non_face_image_paths]
# Create training data and labels
X_train = np.array(face_features + non_face_features)
y_train = np.array([1] * len(face_features) + [-1] * len(non_face_features))
# Train the SVM classifier
clf = svm.LinearSVC()
clf.fit(X_train, y_train)
# Load the test image
test_image = cv2.imread('people.jpg')
gray_test_image = cv2.cvtColor(test_image, cv2.COLOR_BGR2GRAY)
# Define the sliding window size and step
window_size = (64, 128)
step_size = 16
# Function to extract HOG features for a grayscale image
def extract_hog_features_gray(image, resize_dim=(64, 128)):
    resized_image = cv2.resize(image, resize_dim)
    features, hog_image = hog(resized_image, pixels_per_cell=(8, 8), cells_per_block=(2, 2), visualize=True, channel_axis=None)
    return features
# Perform sliding window
for y in range(0, gray_test_image.shape[0] - window_size[1], step_size):
    for x in range(0, gray_test_image.shape[1] - window_size[0], step_size):
        window = gray_test_image[y:y + window_size[1], x:x + window_size[0]]
        if window.shape[0] != window_size[1] or window.shape[1] != window_size[0]:
            continue
        # Extract HOG features from the window
        window_hog_features = extract_hog_features_gray(window)
        # Predict using the trained SVM
        prediction = clf.predict([window_hog_features])
        # If a face is detected, draw a rectangle around it
        if prediction[0] == 1:
            cv2.rectangle(test_image, (x, y), (x + window_size[0], y + window_size[1]), (0, 255, 0), 2)
# Display the result
plt.imshow(cv2.cvtColor(test_image, cv2.COLOR_BGR2RGB))
plt.title("Detected Faces")
plt.show()
Let's break down the mathematics involved in these steps:
Landmark Detection: Given an image 𝐼, the landmark detection algorithm identifies a set of points {(𝑥𝑖,𝑦𝑖)} for 𝑖 = 1 to 𝑛, where n is the number of landmarks (e.g., 68 for dlib).
Affine Transformation: The affine transformation maps the detected landmarks {(𝑥𝑖, 𝑦𝑖)} to a set of predefined target points {(𝑥𝑖′, 𝑦𝑖′)}. An affine transformation is represented by a matrix 𝐴 and a translation vector 𝑡:
[𝑥𝑖′, 𝑦𝑖′]ᵀ = 𝐴 [𝑥𝑖, 𝑦𝑖]ᵀ + 𝑡
𝐴 and 𝑡 are computed by least squares, minimizing the error between the transformed landmarks and the target landmarks. This amounts to solving the linear system
𝑋′ = 𝐴𝑋 + 𝑇
where:
𝑋 is a 2×𝑛 matrix of detected landmark coordinates,
𝑋′ is a 2×𝑛 matrix of target landmark coordinates,
𝐴 is a 2×2 transformation matrix,
𝑇 is a 2×1 translation vector (broadcast across the 𝑛 columns).
The least-squares solution can be found using standard linear algebra techniques.
Applying the Transformation: Once 𝐴 and 𝑇 are computed, the affine transformation can be applied to the entire image using techniques such as bilinear interpolation to produce the aligned face image.
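As a minimal sketch of that least-squares step (an illustration only; the code below instead uses OpenCV's cv2.estimateAffinePartial2D), the 2×3 affine matrix [A | t] can be estimated from point correspondences with NumPy:
import numpy as np
# Hypothetical helper: fit [A | t] mapping src_pts -> dst_pts by least squares.
# src_pts, dst_pts: arrays of shape (n, 2) with n >= 3 corresponding points.
def fit_affine_lstsq(src_pts, dst_pts):
    n = src_pts.shape[0]
    G = np.hstack([src_pts, np.ones((n, 1))])              # each row: [x, y, 1]
    P, _, _, _ = np.linalg.lstsq(G, dst_pts, rcond=None)   # solves G @ P ≈ dst_pts
    return P.T                                             # 2x3 matrix [[a11, a12, tx], [a21, a22, ty]]
# Usage sketch: M = fit_affine_lstsq(landmarks, target_landmarks); cv2.warpAffine(image, M, (96, 112))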
import cv2
import numpy as np
import matplotlib.pyplot as plt
# Step 1: Face Detection
face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
image_path = "Kohli.jpg" # Replace with your image path
image = cv2.imread(image_path)
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
faces = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30))
# Display detected faces
for (x, y, w, h) in faces:
    cv2.rectangle(image, (x, y), (x+w, y+h), (255, 0, 0), 2)
plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
plt.title("Detected Faces")
plt.show()
# Step 2: Landmark Detection (Manual Input via Code)
def manual_landmarks(image, face):
    x, y, w, h = face
    cropped_face = image[y:y+h, x:x+w]
    plt.imshow(cv2.cvtColor(cropped_face, cv2.COLOR_BGR2RGB))
    plt.title("Face Region")
    plt.show()
    # Manually enter coordinates here
    left_eye = [110, 150]
    right_eye = [300, 150]
    nose = [220, 200]
    left_mouth_corner = [130, 300]
    right_mouth_corner = [300, 300]
    points = np.array([
        left_eye,
        right_eye,
        nose,
        left_mouth_corner,
        right_mouth_corner
    ], dtype=np.float32)
    return points + [x, y]
face = faces[0]
landmarks = manual_landmarks(image, face)
# Display annotated landmarks
for (x, y) in landmarks:
    cv2.circle(image, (int(x), int(y)), 6, (0, 255, 0), -1)
plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
plt.title("Annotated Landmarks")
plt.show()

# Step 3: Affine Transformation
target_landmarks = np.array([
[30.2946, 51.6963], # Left eye
[65.5318, 51.5014], # Right eye
[48.0252, 71.7366], # Nose
[33.5493, 92.3655], # Left mouth corner
[62.7299, 92.2041] # Right mouth corner
], dtype=np.float32)
M, _ = cv2.estimateAffinePartial2D(landmarks, target_landmarks)
# Step 4: Applying the Transformation
aligned_face_size = (96, 112)
aligned_face = cv2.warpAffine(image, M, aligned_face_size)
# Display the aligned face
plt.imshow(cv2.cvtColor(aligned_face, cv2.COLOR_BGR2RGB))
plt.title("Aligned Face")
plt.show()
Mathematically, a face embedding is a vector 𝑓 ∈ 𝑅^𝑑, where 𝑑 is the dimensionality of the embedding space. For example, 𝑑 might be 128, 256, or even higher.
Given a face image 𝑥, the CNN outputs the face embedding 𝑓(𝑥).
To train the CNN to produce meaningful embeddings, a suitable loss function is used. Common loss functions include: Triplet Loss: Ensures that the distance between an anchor face and a positive face (same person) is smaller than the distance between the anchor face and a negative face (different person) by a margin 𝛼.
Mathematically:
L(a, p, n) = max(0, ∥f(a) − f(p)∥² − ∥f(a) − f(n)∥² + α), where 𝑎 is the anchor image, 𝑝 is the positive image, and 𝑛 is the negative image.
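A minimal NumPy sketch of this triplet loss for three precomputed embedding vectors (the names f_a, f_p, f_n are illustrative):
import numpy as np
def triplet_loss(f_a, f_p, f_n, alpha=0.2):
    d_pos = np.sum((f_a - f_p) ** 2)   # squared distance anchor-positive
    d_neg = np.sum((f_a - f_n) ** 2)   # squared distance anchor-negative
    # Hinge: zero loss once the negative is farther than the positive by at least alpha.
    return max(0.0, d_pos - d_neg + alpha)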
Contrastive Loss: Similar to triplet loss but uses pairs of images. Minimizes the distance between embeddings of similar pairs and maximizes the distance between embeddings of dissimilar pairs. Mathematically:
𝐿(𝑥1, 𝑥2, 𝑦) = 𝑦⋅∥𝑓(𝑥1) − 𝑓(𝑥2)∥² + (1 − 𝑦)⋅max(0, 𝑚 − ∥𝑓(𝑥1) − 𝑓(𝑥2)∥)², where 𝑦 is 1 if 𝑥1 and 𝑥2 are the same person and 0 otherwise, and 𝑚 is the margin.
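A corresponding sketch of the contrastive loss for a pair of embeddings (the names f1 and f2 are illustrative; y is 1 for a same-person pair and 0 otherwise):
import numpy as np
def contrastive_loss(f1, f2, y, m=1.0):
    d = np.linalg.norm(f1 - f2)                         # Euclidean distance between embeddings
    # y = 1 pulls matching pairs together; y = 0 pushes non-matching pairs beyond margin m.
    return y * d ** 2 + (1 - y) * max(0.0, m - d) ** 2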
Face Comparison: To decide whether two face images 𝑥1 and 𝑥2 show the same person, the distance between their embeddings is computed, typically the Euclidean distance: 𝑑(𝑓(𝑥1), 𝑓(𝑥2)) = ∥𝑓(𝑥1) − 𝑓(𝑥2)∥₂. Alternatively, cosine similarity can also be used: cosine(𝑓(𝑥1), 𝑓(𝑥2)) = 𝑓(𝑥1)⋅𝑓(𝑥2) / (∥𝑓(𝑥1)∥⋅∥𝑓(𝑥2)∥).
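The Euclidean distance is implemented in the code that follows; a sketch of the cosine-similarity alternative (illustrative only):
import numpy as np
def cosine_similarity(f1, f2):
    # Cosine of the angle between the two embedding vectors.
    return float(np.dot(f1, f2) / (np.linalg.norm(f1) * np.linalg.norm(f2)))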
import numpy as np
import cv2
import matplotlib.pyplot as plt
def display_image(image, title="Image"):
    plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    plt.title(title)
    plt.axis('off')
    plt.show()

For a more detailed view of CNNs, and to see a CNN built from scratch, see this project: https://github.com/Nihar1402-iit/Animal-Classification. For an understanding of the significance of each layer's output, see this notebook: https://github.com/Nihar1402-iit/Animal-Classification/blob/main/Visualising_CNN_layer_outputs%20(1).ipynb
def conv_layer(input_data, filters, kernel_size):
    # Naive valid convolution with random kernels (illustration only, no learning)
    output = np.zeros((input_data.shape[0] - kernel_size + 1, input_data.shape[1] - kernel_size + 1, filters))
    for f in range(filters):
        kernel = np.random.rand(kernel_size, kernel_size, input_data.shape[2])
        for i in range(output.shape[0]):
            for j in range(output.shape[1]):
                output[i, j, f] = np.sum(input_data[i:i + kernel_size, j:j + kernel_size] * kernel)
    return output

def create_embedding(image):
    conv1 = conv_layer(image, filters=16, kernel_size=3)
    conv1_flat = conv1.flatten()
    embedding = np.random.rand(128)  # Simulated embedding for simplicity
    return embedding

def euclidean_distance(embedding1, embedding2):
    distance = np.sqrt(np.sum((embedding1 - embedding2)**2))
    return distance
# Load and preprocess images
image1_path = 'Nihar.jpg'
image2_path = 'Photo.jpg'
image1 = cv2.imread(image1_path)
image2 = cv2.imread(image2_path)
# Display the images
display_image(image1, title="Image 1")
display_image(image2, title="Image 2")
# Create embeddings for the two images
embedding1 = create_embedding(image1)
embedding2 = create_embedding(image2)
print("Embedding 1:", embedding1)
print("Embedding 2:", embedding2)
# Compute the Euclidean distance between the embeddings
distance = euclidean_distance(embedding1, embedding2)
# Output the result
print("Euclidean distance between the embeddings:", distance)

Embedding 1: [128-dimensional simulated vector; values omitted]
Embedding 2: [128-dimensional simulated vector; values omitted]
Euclidean distance between the embeddings: 1.144180441627088
# Load and preprocess images
image1_path = 'ABD.jpeg'
image2_path = 'Photo.jpg'
image1 = cv2.imread(image1_path)
image2 = cv2.imread(image2_path)
# Display the images
display_image(image1, title="Image 1")
display_image(image2, title="Image 2")
# Create embeddings for the two images
embedding1 = create_embedding(image1)
embedding2 = create_embedding(image2)
print("Embedding 1:", embedding1)
print("Embedding 2:", embedding2)
# Compute the Euclidean distance between the embeddings
distance = euclidean_distance(embedding1, embedding2)
# Output the result
print("Euclidean distance between the embeddings:", distance)

Embedding 1: [128-dimensional simulated vector; values omitted]
Embedding 2: [128-dimensional simulated vector; values omitted]
Euclidean distance between the embeddings: 5.132558660460171