|
|
|
@ -148,11 +148,48 @@ print(type(img))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#### 5.2.3 Spark 加载数据集
|
|
|
|
|
|
|
|
|
|
上面我们已经把数据集处理完毕,接下来我们可以通过`Spark`来加载数据集。
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
|
import cv2
|
|
|
|
|
import os
|
|
|
|
|
import pandas as pd
|
|
|
|
|
from pyspark.mllib.evaluation import BinaryClassificationMetrics
|
|
|
|
|
from pyspark.sql import SparkSession
|
|
|
|
|
from pyspark.ml.linalg import Vectors
|
|
|
|
|
from pyspark.ml.classification import LogisticRegression
|
|
|
|
|
|
|
|
|
|
def get_file_path(root_path):
    """Return the full paths of all entries directly under root_path.

    Note: this lists files and subdirectories alike and does not recurse.
    """
    return [os.path.join(root_path, entry) for entry in os.listdir(root_path)]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def img2vector(imgfilename):
    """Read an image as grayscale and flatten it into a dense Spark vector.

    Args:
        imgfilename: path to an image file on disk.

    Returns:
        pyspark.ml.linalg.DenseVector of length rows * columns.

    Raises:
        FileNotFoundError: if the image cannot be read. cv2.imread does
            not raise on failure -- it returns None, which would otherwise
            surface later as a confusing AttributeError on ``.shape``.
    """
    img = cv2.imread(imgfilename, cv2.IMREAD_GRAYSCALE)
    if img is None:
        raise FileNotFoundError("cannot read image: {}".format(imgfilename))
    rows, columns = img.shape
    # Flatten the 2-D pixel grid into a single 1-D feature vector.
    img = img.reshape(rows * columns)
    return Vectors.dense(img)
|
|
|
|
|
|
|
|
|
|
# Directory containing the sample images to load.
root_path = r"testing2"
fileList = get_file_path(root_path)

# Build (feature-vector, label) pairs.
# Label scheme: 0 = btr-70, 1 = t-72, 2 = anything else.
vectos = []
for x in fileList:
    vector = img2vector(x)
    label = 0 if "btr-70" in x else (1 if "t-72" in x else 2)
    vectos.append((vector, label))

# Assemble a pandas DataFrame first, then hand it over to Spark.
df = pd.DataFrame(vectos, columns=['features', 'label'])

spark = SparkSession.builder.master("local[*]").appName("demo").getOrCreate()
spark.sparkContext.setLogLevel("error")
sparkDF = spark.createDataFrame(df)  # df is the pandas DataFrame built above
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|