{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy :  0.9057717130739442\n",
      "+-----------------+-----+--------------+-------------+----------+\n",
      "|         features|label| rawPrediction|  probability|prediction|\n",
      "+-----------------+-----+--------------+-------------+----------+\n",
      "|[4.7,3.2,1.3,0.2]|  0.0|[40.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|\n",
      "|[4.8,3.0,1.4,0.3]|  0.0|[40.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|\n",
      "|[4.8,3.1,1.6,0.2]|  0.0|[40.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|\n",
      "|[4.9,3.0,1.4,0.2]|  0.0|[40.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|\n",
      "|[4.9,3.1,1.5,0.1]|  0.0|[40.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|\n",
      "|[5.0,2.3,3.3,1.0]|  1.0|[0.0,34.0,0.0]|[0.0,1.0,0.0]|       1.0|\n",
      "|[5.0,3.0,1.6,0.2]|  0.0|[40.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|\n",
      "|[5.1,2.5,3.0,1.1]|  1.0|[0.0,34.0,0.0]|[0.0,1.0,0.0]|       1.0|\n",
      "|[5.1,3.3,1.7,0.5]|  0.0|[40.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|\n",
      "|[5.1,3.5,1.4,0.2]|  0.0|[40.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|\n",
      "|[5.2,2.7,3.9,1.4]|  1.0|[0.0,34.0,0.0]|[0.0,1.0,0.0]|       1.0|\n",
      "|[5.2,3.4,1.4,0.2]|  0.0|[40.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|\n",
      "|[5.5,2.3,4.0,1.3]|  1.0|[0.0,34.0,0.0]|[0.0,1.0,0.0]|       1.0|\n",
      "|[5.5,2.4,3.8,1.1]|  1.0|[0.0,34.0,0.0]|[0.0,1.0,0.0]|       1.0|\n",
      "|[5.7,2.8,4.1,1.3]|  1.0|[0.0,34.0,0.0]|[0.0,1.0,0.0]|       1.0|\n",
      "|[5.7,3.8,1.7,0.3]|  0.0|[40.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|\n",
      "|[5.8,2.7,5.1,1.9]|  2.0|[0.0,0.0,37.0]|[0.0,0.0,1.0]|       2.0|\n",
      "|[6.0,2.2,5.0,1.5]|  2.0|[0.0,34.0,0.0]|[0.0,1.0,0.0]|       1.0|\n",
      "|[6.0,3.0,4.8,1.8]|  2.0| [0.0,1.0,0.0]|[0.0,1.0,0.0]|       1.0|\n",
      "|[6.0,3.4,4.5,1.6]|  1.0|[0.0,34.0,0.0]|[0.0,1.0,0.0]|       1.0|\n",
      "+-----------------+-----+--------------+-------------+----------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from pyspark.sql import SparkSession\n",
    "from pyspark.ml.feature import VectorAssembler,StringIndexer\n",
    "from pyspark.ml.classification import DecisionTreeClassifier\n",
    "from pyspark.ml.evaluation import MulticlassClassificationEvaluator\n",
    "\n",
    "# Reuse the running Spark session, or start one, through the documented builder attribute.\n",
    "spark = SparkSession.builder.appName(\"DataFrame Introduction\").getOrCreate()\n",
    "\n",
    "# Read the comma-delimited iris file: the first row is the header and column types are inferred.\n",
    "irisDF = (\n",
    "    spark.read\n",
    "    .option(\"delimiter\", \",\")\n",
    "    .option('inferSchema', 'true')\n",
    "    .option(\"header\", \"true\")\n",
    "    .csv(\"datasets/iris-dataset.txt\")\n",
    ")\n",
    "\n",
    "# Encode the string 'class' column as a numeric 'label' column.\n",
    "indxr = StringIndexer(inputCol='class', outputCol='label')\n",
    "irisDF = indxr.fit(irisDF).transform(irisDF)\n",
    "\n",
    "# Pack the four measurement columns into a single 'features' vector column.\n",
    "vec = VectorAssembler(\n",
    "    inputCols=['sepal-length', 'sepal-width', 'petal-length', 'petal-width'],\n",
    "    outputCol='features',\n",
    ")\n",
    "irisDF = vec.transform(irisDF)\n",
    "\n",
    "# Keep only the two columns the classifier and the evaluator read.\n",
    "irisDF = irisDF.select('features', 'label')\n",
    "\n",
    "dtClassifier = DecisionTreeClassifier()\n",
    "\n",
    "# 80/20 train/test split; the fixed seed keeps the split repeatable between runs.\n",
    "trainDF, testDF = irisDF.randomSplit([0.8, 0.2], seed=1222)\n",
    "model = dtClassifier.fit(trainDF)\n",
    "\n",
    "# Score the held-out rows; this adds the rawPrediction, probability and prediction columns.\n",
    "resultDF = model.transform(testDF)\n",
    "\n",
    "# Request accuracy explicitly: the evaluator's default metric is F1, which would not\n",
    "# match the label printed below.\n",
    "eva = MulticlassClassificationEvaluator(metricName='accuracy')\n",
    "\n",
    "res = eva.evaluate(resultDF)\n",
    "print(\"Accuracy : \",res)\n",
    "resultDF.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy :  0.47799422799422797\n",
      "+-----------------+-----+--------------------+--------------------+----------+\n",
      "|         features|label|       rawPrediction|         probability|prediction|\n",
      "+-----------------+-----+--------------------+--------------------+----------+\n",
      "|[4.7,3.2,1.3,0.2]|  0.0|[37.3235555364324...|[1.0,9.7390069693...|       0.0|\n",
      "|[4.8,3.0,1.4,0.3]|  0.0|[37.3235555364324...|[1.0,9.7390069693...|       0.0|\n",
      "|[4.8,3.1,1.6,0.2]|  0.0|[37.3235555364324...|[1.0,9.7390069693...|       0.0|\n",
      "|[4.9,3.0,1.4,0.2]|  0.0|[37.3235555364324...|[1.0,9.7390069693...|       0.0|\n",
      "|[4.9,3.1,1.5,0.1]|  0.0|[37.3235555364324...|[1.0,9.7390069693...|       0.0|\n",
      "|[5.0,2.3,3.3,1.0]|  1.0|[-25.339887527668...|[1.04971248690950...|       2.0|\n",
      "|[5.0,3.0,1.6,0.2]|  0.0|[37.3235555364324...|[1.0,9.7390069693...|       0.0|\n",
      "|[5.1,2.5,3.0,1.1]|  1.0|[37.3235527507829...|[1.0,9.7390407293...|       0.0|\n",
      "|[5.1,3.3,1.7,0.5]|  0.0|[37.3235555364324...|[1.0,9.7390069693...|       0.0|\n",
      "|[5.1,3.5,1.4,0.2]|  0.0|[37.3235555364324...|[1.0,9.7390069693...|       0.0|\n",
      "|[5.2,2.7,3.9,1.4]|  1.0|[-25.339887544571...|[1.04971245296878...|       2.0|\n",
      "|[5.2,3.4,1.4,0.2]|  0.0|[37.3235555364324...|[1.0,9.7390069693...|       0.0|\n",
      "|[5.5,2.3,4.0,1.3]|  1.0|[-25.339887544571...|[1.04971245296878...|       2.0|\n",
      "|[5.5,2.4,3.8,1.1]|  1.0|[-25.339887544571...|[1.04971245296878...|       2.0|\n",
      "|[5.7,2.8,4.1,1.3]|  1.0|[-25.339887544571...|[1.04971245296878...|       2.0|\n",
      "|[5.7,3.8,1.7,0.3]|  0.0|[37.3235555364324...|[1.0,9.7390069693...|       0.0|\n",
      "|[5.8,2.7,5.1,1.9]|  2.0|[-25.339887544571...|[1.04971245296878...|       2.0|\n",
      "|[6.0,2.2,5.0,1.5]|  2.0|[-25.339887544571...|[1.04971245296878...|       2.0|\n",
      "|[6.0,3.0,4.8,1.8]|  2.0|[-25.339887544571...|[1.04971245296878...|       2.0|\n",
      "|[6.0,3.4,4.5,1.6]|  1.0|[-25.339887544571...|[1.04971245296878...|       2.0|\n",
      "+-----------------+-----+--------------------+--------------------+----------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from pyspark.sql import SparkSession\n",
    "from pyspark.ml.feature import VectorAssembler,StringIndexer\n",
    "from pyspark.ml.classification import MultilayerPerceptronClassifier\n",
    "from pyspark.ml.evaluation import MulticlassClassificationEvaluator\n",
    "\n",
    "# Reuse the running Spark session, or start one, through the documented builder attribute.\n",
    "spark = SparkSession.builder.appName(\"DataFrame Introduction\").getOrCreate()\n",
    "\n",
    "# Read the comma-delimited iris file from HDFS: the first row is the header and column types are inferred.\n",
    "# NOTE(review): 'ipadres' looks like a placeholder for the HDFS namenode host - confirm before running.\n",
    "irisDF = (\n",
    "    spark.read\n",
    "    .option(\"delimiter\", \",\")\n",
    "    .option('inferSchema', 'true')\n",
    "    .option(\"header\", \"true\")\n",
    "    .csv(\"hdfs://ipadres/user/ahmetdemirelli/datasets/iris-dataset.txt\")\n",
    ")\n",
    "\n",
    "# Encode the string 'class' column as a numeric 'label' column.\n",
    "indxr = StringIndexer(inputCol='class', outputCol='label')\n",
    "irisDF = indxr.fit(irisDF).transform(irisDF)\n",
    "\n",
    "# Pack the four measurement columns into a single 'features' vector column.\n",
    "vec = VectorAssembler(\n",
    "    inputCols=['sepal-length', 'sepal-width', 'petal-length', 'petal-width'],\n",
    "    outputCol='features',\n",
    ")\n",
    "irisDF = vec.transform(irisDF)\n",
    "\n",
    "# Keep only the two columns the classifier and the evaluator read.\n",
    "irisDF = irisDF.select('features', 'label')\n",
    "\n",
    "# Layer sizes: 4 inputs (one per assembled feature), one hidden layer of 2 units, 3 outputs.\n",
    "# The seed fixes the weight initialisation so repeated runs give the same model.\n",
    "mlpClassifier = MultilayerPerceptronClassifier(layers=[4, 2, 3], seed=1222)\n",
    "\n",
    "# 80/20 train/test split; the fixed seed keeps the split repeatable between runs.\n",
    "trainDF, testDF = irisDF.randomSplit([0.8, 0.2], seed=1222)\n",
    "model = mlpClassifier.fit(trainDF)\n",
    "\n",
    "# Score the held-out rows; this adds the rawPrediction, probability and prediction columns.\n",
    "resultDF = model.transform(testDF)\n",
    "\n",
    "# Request accuracy explicitly: the evaluator's default metric is F1, which would not\n",
    "# match the label printed below.\n",
    "eva = MulticlassClassificationEvaluator(metricName='accuracy')\n",
    "\n",
    "res = eva.evaluate(resultDF)\n",
    "print(\"Accuracy : \",res)\n",
    "resultDF.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Steps to run this code on the cluster:\n",
    "# 1) Copy the data to HDFS.\n",
    "# 2) Read the file from HDFS (hdfs://10.0.0.1/user/ahmetdemirelli/datasets/iris-dataset.txt).\n",
    "# 3) Start the job with: spark-submit 02-SparkML-Classification.py\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
