Skip to content

Commit

Permalink
Data Preprocessing and Anomaly Detection
Browse files Browse the repository at this point in the history
  • Loading branch information
athiya26 committed Nov 20, 2023
1 parent ab8433a commit 09b1264
Show file tree
Hide file tree
Showing 3 changed files with 380 additions and 0 deletions.
138 changes: 138 additions & 0 deletions model_data_with_anomalies.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
source,title,price,location,odometer,image_count,anomaly_score,has_anomaly
facebook,ram laramie x diesel,4999.0,"Houston, TX",100000.0,1,0.18596239000029452,0
facebook,chevrolet silverado crew cab rst pickup ft,52888.0,"Houston, TX",17000.0,1,0.10085177902743259,0
facebook,nissan versa sedan,7500.0,"Houston, TX",42000.0,1,0.16429336849520748,0
facebook,toyota runner trd premium sport utility,43602.0,"Houston, TX",19000.0,1,0.1245762809579582,0
facebook,ford f super duty crew cab platinum pickup ft,79992.0,"Houston, TX",25000.0,1,0.05489800948335205,0
facebook,ford mustang gt coupe,4999.0,"Spring, TX",9000.0,1,0.030995813296170405,1
facebook,kia k lxs sedan,20929.0,"Houston, TX",68000.0,1,0.15092247425696403,0
facebook,ford bronco black diamond sport utility,42995.0,"Houston, TX",13000.0,1,0.12393566265303096,0
facebook,ford expedition xlt sport utility,38995.0,"Houston, TX",65000.0,1,0.13088074579512832,0
craigslist,ford f king ranch x diesel nav roof new wheels tires,39995.0,WWW.GETADIESEL.COM,122000.0,24,0.14230162498520027,0
craigslist,ford f platinum x diesel nav sunroof toy tires,33885.0,WWW.GETADIESEL.COM,194000.0,24,0.13864289252206835,0
facebook,ford expedition max platinum sport utility,33739.0,"Houston, TX",100000.0,1,0.1446137154822576,0
craigslist,ram x cummins neckover flatbed new tires,38885.0,WWW.GETADIESEL.COM,156000.0,24,0.14113540010860937,0
facebook,ford explorer st sport utility,45788.0,"Houston, TX",21000.0,1,0.1218959689428955,0
craigslist,ram x cummins back cam b w hitch tx truck,32999.0,WWW.GETADIESEL.COM,209000.0,24,0.13630659675498263,0
facebook,lincoln aviator black label grand touring sport utility,85488.0,"Houston, TX",11000.0,1,0.042989215628509436,1
craigslist,bmw x,9500.0,Abilene,141000.0,10,0.06327754725917423,0
facebook,volkswagen atlas cross sport sel motion sport utility,35588.0,"Houston, TX",53000.0,1,0.13865588271484158,0
craigslist,ford lariat x v leather nav leveled new,27990.0,WWW.GETADIESEL.COM,109000.0,23,0.1191502343549033,0
facebook,ford f supercrew cab king ranch pickup ft,32998.0,"Houston, TX",90000.0,1,0.14349622056540895,0
craigslist,ford xl x diesel skirted cm flatbed new tires,49999.0,WWW.GETADIESEL.COM,71000.0,24,0.10649685605121312,0
facebook,ford maverick lariat pickup ft,36888.0,"Houston, TX",9000.0,1,0.12789629559974452,0
craigslist,ford super duty platinum crew cab,79900.0,Ford F-450 Super Duty,78000.0,24,-0.0178812849770843,1
facebook,toyota camry xse,2500.0,"Houston, TX",83000.0,1,0.1799660885783036,0
facebook,nissan sentra,8200.0,"Houston, TX",107000.0,1,0.18476581416772508,0
craigslist,peterbilt,42900.0,Peterbilt 579 w/sleeper,627000.0,22,-0.03071784696102875,1
facebook,ford f supercrew cab fx pickup ft,,"Houston, TX",118000.0,1,0.17950933276122344,0
craigslist,ram power wagon x hemi nav winch ram boxes new,35995.0,WWW.GETADIESEL.COM,107000.0,24,0.14183774540922023,0
facebook,dodge journey crossroad sport utility,1500.0,"Houston, TX",114000.0,1,0.17403495961802645,0
craigslist,ford f lariat x diesel leather nav roof new mt tires,28885.0,WWW.GETADIESEL.COM,202000.0,24,0.13989820895145788,0
craigslist,ram x cummins bfg tires priced trade value,24850.0,WWW.GETADIESEL.COM,196000.0,24,0.12870939627196742,0
facebook,buick enclave sport utility,3500.0,"Houston, TX",138000.0,1,0.1573383751826588,0
craigslist,ford king ranch x diesel nav roof gooseneck toyo ht tires,31800.0,WWW.GETADIESEL.COM,227000.0,24,0.1291563059292763,0
facebook,ford xlt pickup x,,"Houston, TX",113000.0,1,0.17926225200608426,0
craigslist,ford f supercrew platinum,40000.0,"Breckenridge, Texas",52000.0,14,-0.0027951370855203717,1
facebook,ram mega cab,2999.0,"Houston, TX",100000.0,1,0.18713626161795982,0
craigslist,hyundai n line night edition,1.0,Abilene,43000.0,8,0.03979466120983155,1
facebook,gmc sierra z x de enganche,4900.0,"Houston, TX",,1,0.1874700834395005,0
craigslist,ram laramie x cummins nav leveled new,29987.0,WWW.GETADIESEL.COM,200000.0,24,0.14122606551809191,0
facebook,ford f supercrew cab lariat pickup ft,13500.0,"Spring, TX",203000.0,1,0.030858986598606586,1
craigslist,chevy hd ltz x duramax skirted hydraulic hay bed,44995.0,WWW.GETADIESEL.COM,143000.0,24,0.13079924417412814,0
facebook,ram crew cab st pickup ft,4500.0,"Houston, TX",130000.0,1,0.17200986926549283,0
craigslist,gmc canyon elevated,45000.0,Abilene,37000.0,4,0.03465979717464063,1
facebook,ram,2999.0,"Houston, TX",100000.0,1,0.18713626161795982,0
craigslist,ford platinum x diesel lift new meyhems new,42995.0,WWW.GETADIESEL.COM,142000.0,24,0.13768723857478643,0
facebook,toyota corolla le plus sedan,9800.0,"Sugar Land, TX",75000.0,1,0.05611327436419844,0
craigslist,ford lariat x v auto nav black leather tx truck,27999.0,WWW.GETADIESEL.COM,109000.0,24,0.13904857016926075,0
facebook,toyota camry se sedan,2199.0,"Houston, TX",67000.0,1,0.1792899399858539,0
facebook,lincoln mkz sedan,6499.0,"Sugar Land, TX",127000.0,1,0.04385667950054006,1
craigslist,ford king ranch fx diesel nav bds leveling kit fox shocks,43850.0,WWW.GETADIESEL.COM,180000.0,24,0.12556633034364273,0
facebook,gmc sierra z slt x de enganche,7000.0,"Houston, TX",,1,0.1848175282650657,0
craigslist,ram x cummins back cam bfg tires gooseneck,48888.0,WWW.GETADIESEL.COM,131000.0,24,0.12066310860143203,0
facebook,gmc yukon,2999.0,"Houston, TX",100000.0,1,0.18713626161795982,0
craigslist,ram lonestar x cummins leveled new tires,49900.0,WWW.GETADIESEL.COM,126000.0,22,0.10459576629012846,0
facebook,chevrolet silverado z x de enganche,4400.0,"Houston, TX",,1,0.18714564823462826,0
craigslist,ram x hemi lift methods new,32990.0,WWW.GETADIESEL.COM,112000.0,18,0.09063296208902921,0
facebook,ford sport de enganche,5000.0,"Houston, TX",,1,0.18753136090512046,0
craigslist,ram laramie longhorn mega x cummins nav sunroof,29900.0,WWW.GETADIESEL.COM,228000.0,24,0.12913977538617455,0
facebook,chevrolet camaro rs,4000.0,"Pasadena, TX",130000.0,1,0.05475745759401013,0
craigslist,ram x cummins back cam nitto tires tx truck,28995.0,WWW.GETADIESEL.COM,215000.0,24,0.13808741550777864,0
facebook,ford lariat x de enganche,5500.0,"Houston, TX",,1,0.18761996593143898,0
craigslist,cadillac cts turbo automatic black leather well maintained,10900.0,WWW.GETADIESEL.COM,133000.0,24,0.10791024561852375,0
facebook,chevrolet silverado crew cab ls pickup ft,13000.0,"Cypress, TX",154000.0,1,0.02171750937448541,1
craigslist,mercedes benz suv,16900.0,Snyder,128000.0,12,0.026960617079861193,1
facebook,gmc sierra slt x de enganche,4400.0,"Houston, TX",,1,0.18714564823462826,0
craigslist,jeep patriot,7450.0,"Potosi, TX",104000.0,10,-0.011446317475731438,1
craigslist,wd laramie turbo ram,51000.0,Haskell,83000.0,7,0.04599964570521936,1
facebook,nissan altima,1900.0,"Houston, TX",36000.0,1,0.1707998571729935,0
craigslist,ford f fx crew cab great truck hail ding special,15995.0,Clyde,102000.0,24,0.02197260350359459,1
facebook,toyota corolla le special edition sedan,1599.0,"Houston, TX",108000.0,1,0.1817734713343247,0
craigslist,hyundai elantra,7000.0,ABILENE,151000.0,9,0.05029596196625413,0
facebook,dodge avenger sxt sedan,3100.0,"Houston, TX",121000.0,1,0.18009361046234534,0
craigslist,hyundai accent,7000.0,ABILENE,118000.0,7,0.05594094445557429,0
facebook,chevrolet silverado crew cab ltz pickup ft,2795.0,"Houston, TX",8000.0,1,0.14986769699766278,0
craigslist,chev impala,5000.0,ABILENE,251000.0,5,0.017526257955390256,1
facebook,lexus gs sedan,13500.0,"Houston, TX",105000.0,1,0.16806550282781013,0
craigslist,chevy,7500.0,ABILENE,122000.0,8,0.05477290460728734,0
facebook,toyota camry se nightshade edition sedan,1699.0,"Houston, TX",62000.0,1,0.1774492931727455,0
craigslist,toyota runner sr premium x v leather nav nitto,38985.0,WWW.GETADIESEL.COM,48000.0,23,0.10462856961402456,0
facebook,nissan altima sedan,8500.0,"Houston, TX",125000.0,1,0.1723584804683761,0
craigslist,wd laramie turbo ram,51000.0,Haskell,83000.0,7,0.04599964570521936,1
facebook,chevrolet silverado hd crew cab ltz x diesel sale trade,21500.0,"Houston, TX",210000.0,1,0.07556318897918501,0
craigslist,ram longhorn mega cab x cummins saddle leather nav,37995.0,WWW.GETADIESEL.COM,170000.0,24,0.1392140833500518,0
facebook,volkswagen jetta sedan,4200.0,"Spring, TX",160000.0,1,0.04167203165437211,1
craigslist,hyundai tuscon limited suv ready go,9895.0,Clyde,129000.0,20,0.04166519899940785,1
facebook,chevrolet suburban de enganche,4000.0,"Houston, TX",,1,0.18780700437853798,0
craigslist,ram x cummins auto cm skirted flatbed tx truck,23880.0,WWW.GETADIESEL.COM,198000.0,24,0.12496835375674564,0
facebook,chevrolet silverado lt de enganche,4500.0,"Houston, TX",,1,0.18714564823462826,0
craigslist,hyundai santa fe sport,9895.0,Clyde,125000.0,18,0.03327665339692032,1
facebook,chevrolet silverado z ltz pickup x,4495.0,"Katy, TX",112000.0,1,0.014676370351309698,1
craigslist,ram x cummins auto cloth new mt tires,25995.0,WWW.GETADIESEL.COM,146000.0,24,0.13080431946714566,0
facebook,audi q premium plus sport utility,2000.0,"Houston, TX",110000.0,1,0.18250311252758095,0
craigslist,ram slt x hemi leather b w hitch new,29900.0,WWW.GETADIESEL.COM,113000.0,24,0.14399505276303026,0
facebook,honda accord lx sedan,1700.0,"Houston, TX",30000.0,1,0.1641144807873059,0
craigslist,credit check toyota rav xle guaranteed approval,2500.0,www.DEPOTAUTOSALES.com,112000.0,9,0.002898395907380946,1
facebook,toyota camry,2999.0,"Houston, TX",70000.0,1,0.17909161271208118,0
craigslist,hyundai kona door hatchback,9895.0,Clyde,138000.0,21,0.03415303203722447,1
facebook,gmc yukon slt xl de enganche,5000.0,"Houston, TX",,1,0.18753136090512046,0
craigslist,ford lariat x diesel nav sunroof new tires,21800.0,WWW.GETADIESEL.COM,297000.0,24,0.09055779550252968,0
facebook,ram promaster,2999.0,"Houston, TX",100000.0,1,0.18713626161795982,0
craigslist,nissan rogue sv low miles,6500.0,Abilene,82000.0,10,0.06859122325442468,0
facebook,chevrolet silverado ls,4000.0,"Houston, TX",100000.0,1,0.18623941753769735,0
craigslist,ram power wagon x hemi nav winch ram boxes new,35995.0,WWW.GETADIESEL.COM,107000.0,24,0.14183774540922023,0
facebook,chevrolet silverado z ltz pickup x,,"Houston, TX",109000.0,1,0.18384758690876263,0
craigslist,nissan titan xd sl x v leather lift xds,29999.0,WWW.GETADIESEL.COM,90000.0,24,0.13336917160867512,0
facebook,toyota corolla se sedan,1699.0,"Houston, TX",45000.0,1,0.173492849128834,0
craigslist,gmc sierra hd sle x duramax x hostiles new,34990.0,WWW.GETADIESEL.COM,178000.0,24,0.13914141662536728,0
facebook,bmw series,2999.0,"Houston, TX",90000.0,1,0.1831797028453639,0
facebook,cadillac escalade,2000.0,"Houston, TX",59000.0,1,0.18063621572482608,0
facebook,ford ecoboost lifted de enganche,4900.0,"Houston, TX",,1,0.1874700834395005,0
facebook,honda civic sport sedan,1600.0,"Houston, TX",39000.0,1,0.16960915968965506,0
facebook,toyota corolla hatchback xse hatchback,2150.0,"Houston, TX",100000.0,1,0.18551098054038923,0
facebook,kia optima lx sedan,6800.0,"Humble, TX",91000.0,1,0.02860455322330069,1
facebook,toyota sienna se minivan,3000.0,"Houston, TX",120000.0,1,0.17907050597671365,0
facebook,toyota tacoma double cab trd sport pickup ft,3000.0,"Houston, TX",1000.0,1,0.14085733189109162,0
facebook,toyota tacoma double cab limited pickup ft,2999.0,"Houston, TX",90000.0,1,0.1831797028453639,0
facebook,honda civic dx sedan,5800.0,"Houston, TX",140000.0,1,0.164039503936043,0
facebook,toyota camry le sedan,6000.0,"Houston, TX",48000.0,1,0.1704424610556231,0
facebook,chrysler town country limited minivan,6950.0,"Houston, TX",140000.0,1,0.16174675471888023,0
facebook,ford mustang v coupe,10950.0,"Houston, TX",109000.0,1,0.17764476231710002,0
facebook,gmc sierra denali payment,5400.0,"Houston, TX",,1,0.18761996593143898,0
facebook,toyota corolla xse sedan,1699.0,"Houston, TX",41000.0,1,0.17447650468938009,0
facebook,nissan sentra sv,2999.0,"Houston, TX",180000.0,1,0.100369068821408,0
facebook,dodge charger,1200.0,"Houston, TX",82000.0,1,0.17329388620025316,0
facebook,chevrolet silverado crew cab lt pickup ft,22000.0,"Houston, TX",87000.0,1,0.15567646230885518,0
facebook,chevrolet silverado z,3000.0,"Houston, TX",,1,0.1876577958717826,0
facebook,toyota camry se sedan,1600.0,"Houston, TX",51000.0,1,0.1748493915305913,0
facebook,toyota camry se sedan,1699.0,"Houston, TX",27000.0,1,0.1630204107016024,0
facebook,honda civic,1600.0,"Houston, TX",38000.0,1,0.16960915968965506,0
facebook,chevrolet malibu lt sedan,1699.0,"Houston, TX",59000.0,1,0.17922929984416724,0
facebook,toyota corolla le eco premium sedan,9800.0,"Sugar Land, TX",74000.0,1,0.056526107666673986,0
facebook,toyota xle premium sport utility,2999.0,"Houston, TX",90000.0,1,0.1831797028453639,0
facebook,ford f supercrew cab platinum pickup ft,2995.0,"Katy, TX",8000.0,1,-0.006943396072467856,1
facebook,gmc canyon denali de enganche,4500.0,"Houston, TX",,1,0.18714564823462826,0
facebook,chrysler touring sedan,13800.0,"Garland, TX",70000.0,1,0.0180201601574127,1
facebook,toyota corolla le sedan,2000.0,"Arlington, TX",145000.0,1,0.05254372603546836,0
facebook,ford,3400.0,"Arlington, TX",79000.0,1,0.05778912416568971,0
104 changes: 104 additions & 0 deletions scam_score_model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline # Importing Pipeline
from sklearn.ensemble import IsolationForest
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re

# Load the raw scraped listings.
data = pd.read_csv('data-1.csv')

# 1. Feature Selection and Preprocessing
# Work on an explicit copy so later column assignments never touch `data`.
features = data[['source', 'title', 'price', 'location', 'odometer']].copy()

# New feature: number of images attached to each listing, counted as the
# non-null cells among the columns whose name starts with 'images/'.
image_columns = [col for col in data.columns if col.startswith('images/')]
data['image_count'] = data[image_columns].notna().sum(axis=1)
features['image_count'] = data['image_count']

# Fill missing values in the text columns with an empty string.
# The result must be assigned back: selecting a list of columns produces a
# new DataFrame, so calling fillna(..., inplace=True) on that selection
# modifies a temporary object and leaves `features` unchanged.
text_columns = ['source', 'title', 'location']
features[text_columns] = features[text_columns].fillna('')

# Convert price to a numerical value: strip '$' and ',' from string prices,
# then coerce anything that still is not a number to NaN.
features['price'] = features['price'].replace(r'[\$,]', '', regex=True)
features['price'] = pd.to_numeric(features['price'], errors='coerce')

# Convert odometer to numerical values
def convert_odometer(odometer_str):
    """Parse a raw odometer value into an integer reading.

    The first number found in the value is used. Thousands separators
    ("122,000") and decimals ("12.5k") are honoured, and a "k" suffix
    directly attached to the number (optionally after whitespace) scales
    it by 1000. A "k" elsewhere in the text (e.g. in "like new" or "km")
    is ignored.

    Args:
        odometer_str: Raw odometer value. Usually a string such as
            "Driven 100K miles", but numeric values are accepted too.

    Returns:
        The reading as an int, or None when the value is missing or
        contains no number.
    """
    if pd.isna(odometer_str):
        return None

    # str() lets already-numeric cells (e.g. 120000 or 120000.0) go through
    # the same parser instead of raising TypeError inside the regex call.
    text = str(odometer_str)

    match = re.search(r'(\d[\d,]*(?:\.\d+)?)\s*(k\b)?', text, re.IGNORECASE)
    if match is None:
        return None

    value = float(match.group(1).replace(',', ''))
    if match.group(2):
        value *= 1000  # Convert 'k' to thousands
    return int(round(value))

# Run every raw odometer cell through convert_odometer.
features['odometer'] = features['odometer'].apply(convert_odometer)

# Text preprocessing for 'title': download the NLTK resources in the same
# order as before ('punkt', then 'stopwords'), then build the stopword set.
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)
english_stopwords = stopwords.words('english')
stop_words = set(english_stopwords)

def clean_text(text):
    """Normalise a listing title into a space-separated string of words.

    The title is lower-cased, digit runs are removed, the remainder is
    tokenised with ``word_tokenize``, and only purely alphabetic tokens
    that are not in ``stop_words`` are kept.

    Args:
        text: Raw title. Non-string values (NaN or None for a missing
            title) are treated as an empty title.

    Returns:
        The cleaned title; an empty string when nothing survives the
        filtering or when the input is not a string.
    """
    # A missing title reaches this function as float NaN (or None), which
    # has no .lower(); return an empty title instead of raising.
    if not isinstance(text, str):
        return ''

    without_digits = re.sub(r'\d+', '', text.lower())  # Remove numbers
    tokens = word_tokenize(without_digits)
    kept = [
        token for token in tokens
        if token.isalpha() and token not in stop_words
    ]
    return ' '.join(kept)

# Replace each raw title with its cleaned token string.
# NOTE(review): 'title' is not listed in num_features or cat_features below,
# so it appears to be kept for the exported CSV only - confirm intent.
cleaned_titles = features['title'].apply(clean_text)
features['title'] = cleaned_titles

# Preprocessing Pipeline
# Columns treated as numeric vs. categorical by the transformer below.
num_features = ['price', 'odometer', 'image_count']
cat_features = ['source', 'location']

# Numeric columns: fill gaps with the column median, then standardise.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
num_pipeline = Pipeline(numeric_steps)

# Categorical columns are one-hot encoded with the encoder's defaults.
column_transformers = [
    ('num', num_pipeline, num_features),
    ('cat', OneHotEncoder(), cat_features),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

# 2. Unsupervised Anomaly Detection
# Build the model input first, then fit an Isolation Forest on it.
features_preprocessed = preprocessor.fit_transform(features)
model = IsolationForest(
    n_estimators=100,
    contamination='auto',
    random_state=42,
)
model.fit(features_preprocessed)

# 3. Score the listings and save results
scores = model.decision_function(features_preprocessed)

# A listing is flagged (potential scam) when its score is <= 0.05.
# The comparison is evaluated once and reused for the column and the count.
flagged = scores <= 0.05

# Export the cleaned features plus the score and flag for easier inspection.
# These are the values before feature scaling / one-hot encoding.
model_data = features.copy()
model_data['anomaly_score'] = scores
model_data['has_anomaly'] = flagged.astype(int)  # 1 = flagged, 0 = not
model_data.to_csv('model_data_with_anomalies.csv', index=False)

# Report how many listings were flagged.
num_anomalous_listings = flagged.sum()
print(f"Number of listings with anomaly score of 0.05 or less: {num_anomalous_listings}")

# Pair each listing's '_id' value with its score and save that to a CSV file.
score_columns = {
    'listing_link': data['_id'],
    'anomaly_score': scores,
}
scores_df = pd.DataFrame(score_columns)
scores_df.to_csv('scores_and_links.csv', index=False)
Loading

1 comment on commit 09b1264

@vercel
Copy link

@vercel vercel bot commented on 09b1264 Nov 20, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Successfully deployed to the following URLs:

seniordesign – ./

smare.vercel.app
seniordesign-git-main-lryanle.vercel.app
seniordesign-lryanle.vercel.app
smare.lryanle.com

Please sign in to comment.