[MongoDB] Python Pymongo handling exemple

MongoDB Setup

  • MongoDB 연결
1
2
3
4
5
6
7
from pymongo import MongoClient

HOST = ''
USER = ''
PASSWORD = ''
DATABASE_NAME = ''
MONGO_URI = f"mongodb+srv://{USER}:{PASSWORD}@{HOST}/{DATABASE_NAME}?retryWrites=true&w=majority"

Data insert

insert_one

  • 하나의 도큐먼트 insert(Openweather API)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
openweather = {
  "coord": {
    "lon": -122.08,
    "lat": 37.39
  },
  "weather": [
    {
      "id": 800,
      "main": "Clear",
      "description": "clear sky",
      "icon": "01d"
    }
  ],
  "base": "stations",
  "main": {
    "temp": 282.55,
    "feels_like": 281.86,
    "temp_min": 280.37,
    "temp_max": 284.26,
    "pressure": 1023,
    "humidity": 100
  },
  "visibility": 16093,
  "wind": {
    "speed": 1.5,
    "deg": 350
  },
  "clouds": {
    "all": 1
  },
  "dt": 1560350645,
  "sys": {
    "type": 1,
    "id": 5122,
    "message": 0.0139,
    "country": "US",
    "sunrise": 1560343627,
    "sunset": 1560396563
  },
  "timezone": -25200,
  "id": 420006353,
  "name": "Mountain View",
  "cod": 200
  }

COLLECTION_NAME = 'openweather'
coll = MongoClient(MONGO_URI)[DATABASE_NAME][COLLECTION_NAME] # 컬렉션 연결

coll.insert_one(document=openweather)
print(coll.find_one()) # 하나의 도큐먼트
'''
{
   "_id":"ObjectId(""614d496f7abb8d643cf3dc82"")",
   "coord":{
      "lon":-122.08,
      "lat":37.39
   },
   "weather":[
      {
         "id":800,
         "main":"Clear",
         "description":"clear sky",
         "icon":"01d"
      }
   ],
   "base":"stations",
   "main":{
      "temp":282.55,
      "feels_like":281.86,
      "temp_min":280.37,
      "temp_max":284.26,
      "pressure":1023,
      "humidity":100
   },
   "visibility":16093,
   "wind":{
      "speed":1.5,
      "deg":350
   },
   "clouds":{
      "all":1
   },
   "dt":1560350645,
   "sys":{
      "type":1,
      "id":5122,
      "message":0.0139,
      "country":"US",
      "sunrise":1560343627,
      "sunset":1560396563
   },
   "timezone":-25200,
   "id":420006353,
   "name":"Mountain View",
   "cod":200
}
'''

insert_many

  • Github API 이용 여러가지 데이터 insert
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
import time
import requests

sixmini = ''
response = requests.get("https://api.github.com/users/6mini/repos")
time.sleep(1) # github API는 호출이 많이 발생하면 자체적으로 제한을 걸 수 있어서 한번의 API 호출 후 1초 sleep 시간을 지정

if response.status_code == 200:
    sixmini = response.json() # 호출된 API 추가

COLLECTION_NAME = '6mini_repos'
coll = MongoClient(MONGO_URI)[DATABASE_NAME][COLLECTION_NAME]

coll.insert_many(sixmini)
for repo in coll.find(): # 모든 도큐먼트 find 후 for문으로 repo name 출력
    print(repo['name'])
'''
6mini
6mini.github.io
AI_Fashion_Analyzer
github-stats-box
Naver_Movie_Scraper
productive-box
Project
Sixtest
TIL
'''

insert CSV File

  • 타이타닉 CSV 파일을 JSON형태로 변경 후 insert
1
2
3
4
5
6
7
8
import pandas as pd
import json

data = json.loads(pd.read_csv('titanic.csv').to_json(orient='records')) # csv data json형태로 변경
COLLECTION_NAME = 'passengers'
coll = MongoClient(MONGO_URI)[DATABASE_NAME][COLLECTION_NAME]
coll.remove() # 도큐먼트를 비우는 메소드
coll.insert_many(data)

MongoDB to SQLite

  • 데이터 엔지니어의 경우 데이터 수집을 빠르게 진행하고자 NoSQL로 수집을 진행한 뒤, 모델링을 거쳐 분석이 가능하도록 데이터웨어하우스 혹은 데이터 마트 형태로 데이터를 전달한다.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import os
import sqlite3

prlist = ''
response = requests.get("https://api.github.com/repos/codestates/ds-sa-nosql-issues/issues")
time.sleep(1)

if response.status_code == 200:
    prlist = response.json()

COLLECTION_NAME = 'PR_LIST'
DB_FILENAME = 'PR_LIST_RDB.db'
DB_FILEPATH = os.path.join(os.getcwd(), DB_FILENAME)
coll = MongoClient(MONGO_URI)[DATABASE_NAME][COLLECTION_NAME]

coll.insert_many(prlist)

conn = sqlite3.connect(DB_FILEPATH)
cur = conn.cursor()

# 스키마 생성
cur.execute("DROP TABLE IF EXISTS PRList;")
cur.execute("""CREATE TABLE PRList(
            id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL,
            node_id VARCHAR(20),
            title VARCHAR(100),
            body VARCHAR(100),
            created_at VARCHAR(20),
            comments INTEGER,
            userId INTEGER,
            labelId INTEGER
            );""")

cur.execute("DROP TABLE IF EXISTS Label;")
cur.execute("""CREATE TABLE Label(
            id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL,
            name VARCHAR(20),
            node_id VARCHAR(20),
            color VARCHAR(10),
            FOREIGN KEY(id) REFERENCES PRList(userId)
            );""")

cur.execute("DROP TABLE IF EXISTS User;")
cur.execute("""CREATE TABLE User(
            id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL,
            login VARCHAR(10),
            node_id VARCHAR(20),
            FOREIGN KEY(id) REFERENCES PRList(labelId)
            );""")

conn.commit()

# for문 이용 데이터 sqlite insert
for pr in coll.find():
    userId = pr['user']['id']
    labelId = pr['labels'][0]['id']

    cur.execute("""INSERT OR IGNORE INTO PRList (id, node_id, title, body, created_at, comments, userId, labelId) VALUES (?, ?, ?, ?, ?, ?, ?, ?);""", (pr['id'], pr['node_id'], pr['title'], pr['body'], pr['created_at'], pr['comments'], userId, labelId))

    cur.execute("""INSERT OR IGNORE INTO Label (id, name, node_id, color) VALUES (?, ?, ?, ?);""", (labelId, pr['labels'][0]['name'], pr['labels'][0]['node_id'], pr['labels'][0]['color']))

    cur.execute("""INSERT OR IGNORE INTO User (id, login, node_id) VALUES (?, ?, ?);""", (userId, pr['user']['login'], pr['user']['node_id']))

conn.commit()
0%